From f1fec7fc098702eaf566260d916fc5e8582f3e77 Mon Sep 17 00:00:00 2001 From: iochocki Date: Tue, 12 Sep 2023 13:39:59 +0200 Subject: [PATCH 1/9] Add sample --- .../01_sycl_dpct_output/Common/helper_cuda.h | 1022 +++++++++++++++++ .../Common/helper_string.h | 428 +++++++ .../src/radixSortThrust.dp.cpp | 258 +++++ .../02_sycl_dpct_migrated/CMakeLists.txt | 5 + .../Common/helper_cuda.h | 1022 +++++++++++++++++ .../Common/helper_string.h | 428 +++++++ .../02_sycl_dpct_migrated/src/CMakeLists.txt | 5 + .../src/radixSortMigrated.cpp | 239 ++++ .../radix_sort_thrust_migrated/CMakeLists.txt | 16 + 9 files changed, 3423 insertions(+) create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h new file mode 100644 index 0000000000..bc9e302c52 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h @@ -0,0 +1,1022 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(dpct::err0 error) { + /* + DPCT1009:0: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case 
CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case 
NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return 
"NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) 
__getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:1: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:3: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_624496 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_942342 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." 
+ " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + /* + DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:7: The "devID" device may be not the one intended for use. Adjust the + selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:8: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:9: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors( + DPCT_CHECK_ERROR(multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units())); + dpct::err0 result = + DPCT_CHECK_ERROR(clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency()); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:10: The "devID" device may be not the one intended for use. Adjust + the selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + major = + dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = + dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:11: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors( + DPCT_CHECK_ERROR(integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated())); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:12: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:13: The "current_device" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(current_device))); + + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(dev).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version())); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h new file mode 100644 index 0000000000..47fb1ac1fa --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in 
tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp new file mode 100644 index 0000000000..e1c4ae91fe --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp @@ -0,0 +1,258 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "helper_cuda.h"
+
+#include
+#include
+#include
+#include
+
+template <typename T, bool floatKeys> bool testSort(int argc, char **argv) try {
+  int cmdVal;
+  int keybits = 32;
+
+  unsigned int numElements = 1048576;
+  bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly");
+  bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet");
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "n")) {
+    cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n");
+    numElements = cmdVal;
+
+    if (cmdVal < 0) {
+      printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal);
+      exit(EXIT_SUCCESS);
+    }
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) {
+    cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits");
+    keybits = cmdVal;
+
+    if (keybits <= 0) {
+      printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits);
+      exit(EXIT_SUCCESS);
+    }
+  }
+
+  unsigned int numIterations = (numElements >= 16777216) ? 10 : 100;
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) {
+    cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations");
+    numIterations = cmdVal;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+    printf("Command line:\nradixSortThrust [-option]\n");
+    printf("Valid options:\n");
+    printf("-n=<N> : number of elements to sort\n");
+    printf("-keybits=bits : keybits must be > 0\n");
+    printf(
+        "-keysonly : only sort an array of keys (default sorts key-value "
+        "pairs)\n");
+    printf(
+        "-float : use 32-bit float keys (default is 32-bit unsigned "
+        "int)\n");
+    printf(
+        "-quiet : Output only the number of elements and the time to "
+        "sort\n");
+    printf("-help : Output a help message\n");
+    exit(EXIT_SUCCESS);
+  }
+
+  if (!quiet)
+    printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits,
+           floatKeys ? "float" : "unsigned int",
+           keysOnly ? "(only)" : "and values");
+
+  int deviceID = -1;
+
+  if (0 == deviceID = dpct::dev_mgr::instance().current_device_id()) {
+    dpct::device_info devprop;
+    dpct::dev_mgr::instance().get_device(deviceID).get_device_info(devprop);
+    unsigned int totalMem = (keysOnly ? 2 : 4) * numElements * sizeof(T);
+
+    if (devprop.get_global_mem_size() < totalMem) {
+      printf("Error: insufficient amount of memory to sort %d elements.\n",
+             numElements);
+      printf("%d bytes needed, %d bytes available\n", (int)totalMem,
+             (int)devprop.get_global_mem_size());
+      exit(EXIT_SUCCESS);
+    }
+  }
+
+  std::vector<T> h_keys(numElements);
+  std::vector<T> h_keysSorted(numElements);
+  std::vector<unsigned int> h_values;
+
+  if (!keysOnly) h_values = std::vector<unsigned int>(numElements);
+
+  // Fill up with some random data
+  /*
+  DPCT1008:14: clock function is not defined in SYCL. This is a
+  hardware-specific feature. Consult with your hardware vendor to find a
+  replacement.
+  */
+  thrust::default_random_engine rng(clock());
+
+  if (floatKeys) {
+    thrust::uniform_real_distribution<float> u01(0, 1);
+
+    for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng);
+  } else {
+    thrust::uniform_int_distribution<unsigned int> u(0, UINT_MAX);
+
+    for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng);
+  }
+
+  if (!keysOnly)
+    dpct::iota(oneapi::dpl::execution::seq, h_values.begin(), h_values.end());
+
+  // Copy data onto the GPU
+  dpct::device_vector<T> d_keys;
+  dpct::device_vector<unsigned int> d_values;
+
+  // run multiple iterations to compute an average sort time
+  dpct::event_ptr start_event, stop_event;
+  std::chrono::time_point<std::chrono::steady_clock> start_event_ct1;
+  std::chrono::time_point<std::chrono::steady_clock> stop_event_ct1;
+  checkCudaErrors(DPCT_CHECK_ERROR(start_event = new sycl::event()));
+  checkCudaErrors(DPCT_CHECK_ERROR(stop_event = new sycl::event()));
+
+  float totalTime = 0;
+
+  for (unsigned int i = 0; i < numIterations; i++) {
+    // reset data before sort
+    d_keys = h_keys;
+
+    if (!keysOnly) d_values = h_values;
+
+    /*
+    DPCT1012:15: Detected kernel execution time measurement pattern and
+    generated an initial code for time measurements in SYCL. You can change the
+    way time is measured depending on your goals.
+    */
+    /*
+    DPCT1024:16: The original code returned the error code that was further
+    consumed by the program logic. This original code was replaced with 0. You
+    may need to rewrite the program logic consuming the error code.
+    */
+    start_event_ct1 = std::chrono::steady_clock::now();
+    checkCudaErrors(0);
+
+    if (keysOnly)
+      oneapi::dpl::sort(
+          oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+          d_keys.begin(), d_keys.end());
+    else
+      dpct::sort(
+          oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+          d_keys.begin(), d_keys.end(), d_values.begin());
+
+    /*
+    DPCT1012:17: Detected kernel execution time measurement pattern and
+    generated an initial code for time measurements in SYCL. You can change the
+    way time is measured depending on your goals.
+    */
+    /*
+    DPCT1024:18: The original code returned the error code that was further
+    consumed by the program logic. This original code was replaced with 0. You
+    may need to rewrite the program logic consuming the error code.
+ */ + stop_event_ct1 = std::chrono::steady_clock::now(); + checkCudaErrors(0); + checkCudaErrors(0); + + float time = 0; + checkCudaErrors( + DPCT_CHECK_ERROR((time = std::chrono::duration( + stop_event_ct1 - start_event_ct1) + .count()))); + totalTime += time; + } + + totalTime /= (1.0e3f * numIterations); + printf( + "radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " + "%u elements\n", + 1.0e-6f * numElements / totalTime, totalTime, numElements); + + getLastCudaError("after radixsort"); + + // Get results back to host for correctness checking + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), h_keysSorted.begin()); + + if (!keysOnly) + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_values.begin(), d_values.end(), h_values.begin()); + + getLastCudaError("copying results to host memory"); + + // Check results + bool bTestResult = oneapi::dpl::is_sorted( + oneapi::dpl::execution::seq, h_keysSorted.begin(), h_keysSorted.end()); + + checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(start_event))); + checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(stop_event))); + + if (!bTestResult && !quiet) { + return false; + } + + return bTestResult; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int main(int argc, char **argv) { + // Start logs + printf("%s Starting...\n\n", argv[0]); + + findCudaDevice(argc, (const char **)argv); + + bool bTestResult = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "float")) + bTestResult = testSort(argc, argv); + else + bTestResult = testSort(argc, argv); + + printf(bTestResult ? "Test passed\n" : "Test failed!\n"); +} \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt new file mode 100644 index 0000000000..1b0edab6d7 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt @@ -0,0 +1,5 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -std=c++17") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core") + +include_directories(${CMAKE_SOURCE_DIR}/02_sycl_dpct_migrated/Common/) +add_subdirectory("src") \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h new file mode 100644 index 0000000000..bc9e302c52 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h @@ -0,0 +1,1022 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(dpct::err0 error) { + /* + DPCT1009:0: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case 
CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case 
NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return 
"NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:1: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:3: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_624496 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_942342 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + /* + DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:7: The "devID" device may be not the one intended for use. Adjust the + selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:8: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:9: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors( + DPCT_CHECK_ERROR(multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units())); + dpct::err0 result = + DPCT_CHECK_ERROR(clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency()); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:10: The "devID" device may be not the one intended for use. Adjust + the selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + major = + dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = + dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:11: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors( + DPCT_CHECK_ERROR(integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated())); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:12: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:13: The "current_device" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(current_device))); + + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(dev).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version())); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h new file mode 100644 index 0000000000..47fb1ac1fa --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in 
tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt new file mode 100644 index 0000000000..233bda86e2 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(radixSortMigrated radixSortMigrated.cpp) + +target_link_libraries(radixSortMigrated OpenCL sycl) + +add_custom_target(run_radixSortMigrated radixSortMigrated) \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp new file mode 100644 index 0000000000..811abc2d4b --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp @@ -0,0 +1,239 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +template bool testSort(int argc, char **argv) try { + int cmdVal; + int keybits = 32; + + unsigned int numElements = 1048576; + bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly"); + bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet"); + + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + numElements = cmdVal; + + if (cmdVal < 0) { + printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal); + exit(EXIT_SUCCESS); + } + } + + if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits"); + keybits = cmdVal; + + if (keybits <= 0) { + printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits); + exit(EXIT_SUCCESS); + } + } + + unsigned int numIterations = (numElements >= 16777216) ? 10 : 100; + + if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); + numIterations = cmdVal; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Command line:\nradixSortThrust [-option]\n"); + printf("Valid options:\n"); + printf("-n= : number of elements to sort\n"); + printf("-keybits=bits : keybits must be > 0\n"); + printf( + "-keysonly : only sort an array of keys (default sorts key-value " + "pairs)\n"); + printf( + "-float : use 32-bit float keys (default is 32-bit unsigned " + "int)\n"); + printf( + "-quiet : Output only the number of elements and the time to " + "sort\n"); + printf("-help : Output a help message\n"); + exit(EXIT_SUCCESS); + } + + if (!quiet) + printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits, + floatKeys ? "float" : "unsigned int", + keysOnly ? "(only)" : "and values"); + + int deviceID = -1; + deviceID = dpct::dev_mgr::instance().current_device_id(); + + if (0 == deviceID) { + dpct::device_info devprop; + dpct::dev_mgr::instance().get_device(deviceID).get_device_info(devprop); + unsigned int totalMem = (keysOnly ? 
2 : 4) * numElements * sizeof(T); + + if (devprop.get_global_mem_size() < totalMem) { + printf("Error: insufficient amount of memory to sort %d elements.\n", + numElements); + printf("%d bytes needed, %d bytes available\n", (int)totalMem, + (int)devprop.get_global_mem_size()); + exit(EXIT_SUCCESS); + } + } + + std::vector h_keys(numElements); + std::vector h_keysSorted(numElements); + std::vector h_values; + + if (!keysOnly) h_values = std::vector(numElements); + + // Fill up with some random data + oneapi::dpl::minstd_rand rng(clock()); + + if (floatKeys) { + oneapi::dpl::uniform_real_distribution u01(0, 1); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng); + } else { + oneapi::dpl::uniform_int_distribution u(0, UINT_MAX); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng); + } + + if (!keysOnly) + dpct::iota(oneapi::dpl::execution::seq, h_values.begin(), h_values.end()); + + // Copy data onto the GPU + dpct::device_vector d_keys; + dpct::device_vector d_values; + + // run multiple iterations to compute an average sort time + dpct::event_ptr start_event, stop_event; + std::chrono::time_point start_event_ct1; + std::chrono::time_point stop_event_ct1; + start_event = new sycl::event(); + stop_event = new sycl::event(); + + float totalTime = 0; + + for (unsigned int i = 0; i < numIterations; i++) { + // reset data before sort + d_keys = h_keys; + + if (!keysOnly) d_values = h_values; + + start_event_ct1 = std::chrono::steady_clock::now(); + + if (keysOnly) + oneapi::dpl::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end()); + else + dpct::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), d_values.begin()); + + stop_event_ct1 = std::chrono::steady_clock::now(); + + float time = 0; + checkCudaErrors( + DPCT_CHECK_ERROR((time = std::chrono::duration( + stop_event_ct1 - start_event_ct1) + .count()))); + totalTime += time; + } + + totalTime /= (1.0e3f * numIterations); + printf( + "radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " + "%u elements\n", + 1.0e-6f * numElements / totalTime, totalTime, numElements); + + getLastCudaError("after radixsort"); + + // Get results back to host for correctness checking + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), h_keysSorted.begin()); + + if (!keysOnly) + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_values.begin(), d_values.end(), h_values.begin()); + + getLastCudaError("copying results to host memory"); + + // Check results + bool bTestResult = oneapi::dpl::is_sorted( + oneapi::dpl::execution::seq, h_keysSorted.begin(), h_keysSorted.end()); + + dpct::destroy_event(start_event); + dpct::destroy_event(stop_event); + + if (!bTestResult && !quiet) { + return false; + } + + return bTestResult; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int main(int argc, char **argv) { + time_t start, end; + time(&start); + // Start logs + printf("%s Starting...\n\n", argv[0]); + + findCudaDevice(argc, (const char **)argv); + + bool bTestResult = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "float")) + bTestResult = testSort(argc, argv); + else + bTestResult = testSort(argc, argv); + + printf(bTestResult ? 
"Test passed\n" : "Test failed!\n"); + time(&end); + double time_taken = double(end - start); + std::cout << "Time taken by program is : " << std::fixed + << time_taken << std::setprecision(5); + std::cout << " sec " << std::endl; +} \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt new file mode 100644 index 0000000000..b1d80bd355 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required (VERSION 3.5) + +set(CMAKE_CXX_COMPILER "icpx") + +project (radixSortThrustMigrated) +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +add_subdirectory (02_sycl_dpct_migrated) From 87ebec87c123e226ae15458fb03ed19c5b12d546 Mon Sep 17 00:00:00 2001 From: iochocki Date: Wed, 13 Sep 2023 14:18:05 +0200 Subject: [PATCH 2/9] Adjust helper_cuda file --- .../Common/helper_cuda.h | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h index bc9e302c52..ec9f3a4e47 100644 --- a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h @@ -601,10 +601,6 @@ void check(T result, char const *const func, const char *const file, inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) { - /* - DPCT1010:1: SYCL uses exceptions to report errors and does not use the error - codes. The call was replaced with 0. You need to rewrite this code. - */ dpct::err0 err = 0; } @@ -614,10 +610,6 @@ inline void __getLastCudaError(const char *errorMessage, const char *file, inline void __printLastCudaError(const char *errorMessage, const char *file, const int line) { - /* - DPCT1010:3: SYCL uses exceptions to report errors and does not use the error - codes. The call was replaced with 0. You need to rewrite this code. - */ dpct::err0 err = 0; } #endif @@ -761,19 +753,11 @@ inline int gpuDeviceInit(int devID) { } int computeMode = -1, major = 0, minor = 0; - /* - DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may - need to adjust this code. - */ checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); checkCudaErrors(DPCT_CHECK_ERROR( major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); checkCudaErrors(DPCT_CHECK_ERROR( minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); - /* - DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may - need to adjust this code. 
- */ if (computeMode == 0) { fprintf(stderr, "Error: device is running in Date: Wed, 13 Sep 2023 14:21:51 +0200 Subject: [PATCH 3/9] Add initial documentation --- .../radix_sort_thrust_migrated/License.txt | 7 + .../radix_sort_thrust_migrated/README.md | 178 ++++++++++++++++++ .../radix_sort_thrust_migrated/sample.json | 29 +++ 3 files changed, 214 insertions(+) create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt new file mode 100644 index 0000000000..80f3e07572 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md new file mode 100644 index 0000000000..5be873dbc6 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md @@ -0,0 +1,178 @@
+# `Radix Sort Thrust Migration` Sample
+
+The `Radix Sort Thrust Migration` sample demonstrates how the CUDA Thrust-based `radixSortThrust` application can be migrated to SYCL* with the Intel® DPC++ Compatibility Tool and the oneAPI DPC++ Library (oneDPL).
+
+| Area | Description
+|:--- |:---
+| What you will learn | How to begin migrating CUDA code to a SYCL*-compliant equivalent
+| Time to complete | 30 minutes
+| Category | Code Optimization
+
+For more information on the migration workflow, see the Intel® DPC++ Compatibility Tool documentation and the oneAPI DPC++ Library (oneDPL) documentation.
+
+## Purpose
+
+The SYCL source code in this sample was migrated from CUDA source code that offloads computations to a GPU or CPU. The sample demonstrates how to migrate the code to SYCL, complete the migration manually, and verify the result.
+
+The program fills a large array with random keys (and, optionally, associated values), sorts it on the device with oneDPL algorithms, checks that the result is sorted, and reports the sorting throughput. A minimal sketch of the Thrust-to-oneDPL mapping that the migration relies on follows this section.
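The sketch below illustrates that mapping for a keys-only sort. It is an editorial example, not part of the sample sources; it assumes the Intel® oneAPI DPC++/C++ compiler (`icpx -fsycl`) with oneDPL and the `dpct` helper headers available on the include path, and it mirrors the calls used in `02_sycl_dpct_migrated/src/radixSortMigrated.cpp`.

```cpp
// Minimal sketch of the Thrust-to-oneDPL mapping used by the migrated sample.
// Editorial example only; assumes icpx -fsycl with oneDPL and the dpct
// helper headers available on the include path.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <dpct/dpl_utils.hpp>

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned int> h_keys = {42u, 7u, 19u, 3u, 23u};

  // CUDA/Thrust:  thrust::device_vector<unsigned int> d_keys = h_keys;
  dpct::device_vector<unsigned int> d_keys;
  d_keys = h_keys;  // host-to-device copy, as in the migrated sample

  // CUDA/Thrust:  thrust::sort(d_keys.begin(), d_keys.end());
  oneapi::dpl::sort(
      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
      d_keys.begin(), d_keys.end());

  // Copy the sorted keys back to the host and print them.
  oneapi::dpl::copy(
      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
      d_keys.begin(), d_keys.end(), h_keys.begin());
  for (unsigned int k : h_keys) std::printf("%u ", k);
  std::printf("\n");
  return 0;
}
```

For key-value sorting, the migrated sample uses `dpct::sort(policy, keys_first, keys_last, values_first)` in place of `thrust::sort_by_key`.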
+
+>**Note**: This sample is based on the *radixSortThrust* sample in the [NVIDIA/cuda-samples](https://github.com/NVIDIA/cuda-samples) GitHub repository.
+
+## Prerequisites
+
+| Optimized for | Description
+|:--- |:---
+| OS | Ubuntu* 20.04
+| Hardware | 10th Gen Intel® processors or newer
+| Software | Intel® oneAPI DPC++/C++ Compiler
+
+## Key Implementation Details
+
+This sample contains two sets of sources in the following folders:
+
+| Folder Name | Description
+|:--- |:---
+| `01_sycl_dpct_output` | Contains the output of the Intel® DPC++ Compatibility Tool after migrating the original CUDA code to SYCL-compliant code. This output still contains some unmigrated or incorrectly generated code that must be fixed manually before it is functional. (The code does not work as supplied.)
+| `02_sycl_dpct_migrated` | Contains the CUDA-to-SYCL migrated code generated by the Intel® DPC++ Compatibility Tool with the manual changes implemented to make the code fully functional.
+
+The migrated sources build a single program, `radixSortMigrated`, from the `02_sycl_dpct_migrated/src` folder. An annotated example of a typical DPCT diagnostic and how it is resolved appears at the end of this README.
+
+## Set Environment Variables
+
+When working with the command-line interface (CLI), you should configure the oneAPI toolkits using environment variables. Set up your CLI environment by sourcing the `setvars` script every time you open a new terminal window. This practice ensures that your compiler, libraries, and tools are ready for development.
+
+## Build the `Radix Sort Thrust Migration` Sample
+
+> **Note**: If you have not already done so, set up your CLI
+> environment by sourcing the `setvars` script in the root of your oneAPI installation.
+>
+> Linux*:
+> - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
+> - For private installations: ` . ~/intel/oneapi/setvars.sh`
+> - For non-POSIX shells, like csh, use the following command: `bash -c 'source <install-dir>/setvars.sh ; exec csh'`
+>
+> For more information on configuring environment variables, see *[Use the setvars Script with Linux* or macOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html)*.
+
+### On Linux*
+
+1. Change to the sample directory.
+2. Build the sample.
+   ```
+   $ mkdir build
+   $ cd build
+   $ cmake ..
+   $ make
+   ```
+
+   By default, this command sequence builds the version of the source code in the `02_sycl_dpct_migrated` folder.
+
+#### Troubleshooting
+
+If an error occurs, you can get more details by running `make` with
+the `VERBOSE=1` argument:
+```
+make VERBOSE=1
+```
+If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the [Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html) for more information on using the utility.
+
+
+## Run the `Radix Sort Thrust Migration` Sample
+
+### On Linux
+
+Run the program on a CPU or GPU. The sample uses a default device, which in most cases is a GPU.
+
+1. Run the sample in the `02_sycl_dpct_migrated` folder.
+   ```
+   make run_radixSortMigrated
+   ```
+
+### Build and Run the `Radix Sort Thrust Migration` Sample in Intel® DevCloud (Optional)
+
+When running a sample in the Intel® DevCloud, you must specify the compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit [Get Started Guide](https://devcloud.intel.com/oneapi/get_started/).
+
+#### Build and Run Samples in Batch Mode (Optional)
+
+You can submit build and run jobs through the Portable Batch System (PBS). A job is a script that is submitted to PBS through the `qsub` utility. By default, the `qsub` utility does not inherit the current environment variables or your current working directory, so you might need to submit jobs to configure the environment variables.
To indicate the correct working directory, you can use either absolute paths or pass the `-d \<dir\>` option to `qsub`.
+
+1. Open a terminal on a Linux* system.
+2. Log in to Intel® DevCloud.
+   ```
+   ssh devcloud
+   ```
+3. Download the samples.
+   ```
+   git clone https://github.com/oneapi-src/oneAPI-samples.git
+   ```
+4. Change to the sample directory.
+5. Configure the sample for a GPU node and choose the backend as OpenCL.
+   ```
+   qsub -I -l nodes=1:gpu:ppn=2 -d .
+   export SYCL_DEVICE_FILTER=opencl:gpu
+   ```
+   - `-I` (upper case I) requests an interactive session.
+   - `-l nodes=1:gpu:ppn=2` (lower case L) assigns one full GPU node.
+   - `-d .` makes the current folder the working directory for the task.
+
+   |Available Nodes |Command Options
+   |:--- |:---
+   | GPU |`qsub -l nodes=1:gpu:ppn=2 -d .`
+   | CPU |`qsub -l nodes=1:xeon:ppn=2 -d .`
+
+6. Perform build steps as you would on Linux.
+7. Run the program.
+8. Clean up the project files.
+   ```
+   make clean
+   ```
+9. Disconnect from the Intel® DevCloud.
+   ```
+   exit
+   ```
+
+## Example Output
+
+The following is representative output when you build the default target and run `run_radixSortMigrated`. Exact device, timing, and throughput values depend on your hardware and are elided here.
+
+```
+[100%] Built target radixSortMigrated
+<path-to>/radixSortMigrated Starting...
+
+GPU Device 0: "<device name>" with compute capability <major>.<minor>
+
+Sorting 1048576 32-bit unsigned int keys and values
+
+radixSortThrust, Throughput = ... MElements/s, Time = ... s, Size = 1048576 elements
+Test passed
+Time taken by program is : ... sec
+[100%] Built target run_radixSortMigrated
+```
+
+## License
+
+Code samples are licensed under the MIT license. See
+[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+Third party program licenses are at [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
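## Resolving a DPCT Diagnostic (Example)

The `01_sycl_dpct_output` sources retain the `DPCT10xx` diagnostic comments emitted by the Intel® DPC++ Compatibility Tool (for example, `DPCT1009` and `DPCT1010` in `Common/helper_cuda.h`), and patch 2/9 in this series removes the ones that have been reviewed. The sketch below is a rough illustration of how this class of diagnostic is resolved: SYCL reports errors through exceptions rather than CUDA-style error codes, so the migrated code wraps device work in a try/catch for `sycl::exception`, as `testSort()` in `radixSortMigrated.cpp` does. The queue setup and kernel below are hypothetical and are not taken from the sample sources.

```cpp
// Example of a diagnostic left in 01_sycl_dpct_output/Common/helper_cuda.h:
//
//   /*
//   DPCT1010:1: SYCL uses exceptions to report errors and does not use the
//   error codes. The call was replaced with 0. You need to rewrite this code.
//   */
//   dpct::err0 err = 0;
//
// A minimal, hypothetical program showing the exception-based handling that
// replaces CUDA error-code checks after migration:
#include <sycl/sycl.hpp>

#include <cstdlib>
#include <iostream>

int main() try {
  // Report asynchronous errors through an async handler on the queue.
  auto async_handler = [](sycl::exception_list exceptions) {
    for (const auto &e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (sycl::exception const &ex) {
        std::cerr << "Asynchronous SYCL exception: " << ex.what() << std::endl;
      }
    }
  };
  sycl::queue q{sycl::default_selector_v, async_handler};

  // Submit a trivial kernel; synchronous failures throw sycl::exception.
  q.submit([](sycl::handler &h) { h.single_task([]() {}); });
  q.wait_and_throw();  // passes any stored asynchronous errors to the handler

  std::cout << "Device work completed" << std::endl;
  return EXIT_SUCCESS;
} catch (sycl::exception const &exc) {
  // The migrated sample uses this same catch pattern around its device work.
  std::cerr << exc.what() << " Exception caught at file:" << __FILE__
            << ", line:" << __LINE__ << std::endl;
  return EXIT_FAILURE;
}
```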
diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json new file mode 100644 index 0000000000..a43dd2bace --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "E3626FAB-DCD8-465F-A4E7-BF4A858D6583", + "name": "Jacobi Iterative Solver", + "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL/Dense Linear Algebra"], + "description": "Jacobi Iterative Solver provides step by step instructions for CPU, GPU and multiple GPU offload", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["CPU", "GPU"], + "gpuRequired": ["gen11"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run_1_cpu", + "make run_2_gpu", + "make run_3_multi_gpu" + ] + } + ] + }, + "expertise": "Code Optimization" + } + \ No newline at end of file From 914d772d46fbfb950a5727ae844ba2a8a561d96a Mon Sep 17 00:00:00 2001 From: iochocki Date: Tue, 12 Sep 2023 13:39:59 +0200 Subject: [PATCH 4/9] Add sample --- .../01_sycl_dpct_output/Common/helper_cuda.h | 1022 +++++++++++++++++ .../Common/helper_string.h | 428 +++++++ .../src/radixSortThrust.dp.cpp | 258 +++++ .../02_sycl_dpct_migrated/CMakeLists.txt | 5 + .../Common/helper_cuda.h | 1022 +++++++++++++++++ .../Common/helper_string.h | 428 +++++++ .../02_sycl_dpct_migrated/src/CMakeLists.txt | 5 + .../src/radixSortMigrated.cpp | 239 ++++ .../radix_sort_thrust_migrated/CMakeLists.txt | 16 + 9 files changed, 3423 insertions(+) create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h new file mode 100644 index 0000000000..bc9e302c52 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h @@ -0,0 +1,1022 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(dpct::err0 error) { + /* + DPCT1009:0: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case 
CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case 
NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return 
"NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:1: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:3: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_624496 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_942342 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + /* + DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:7: The "devID" device may be not the one intended for use. Adjust the + selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:8: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:9: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors( + DPCT_CHECK_ERROR(multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units())); + dpct::err0 result = + DPCT_CHECK_ERROR(clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency()); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:10: The "devID" device may be not the one intended for use. Adjust + the selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + major = + dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = + dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:11: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors( + DPCT_CHECK_ERROR(integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated())); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:12: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:13: The "current_device" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(current_device))); + + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(dev).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version())); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h new file mode 100644 index 0000000000..47fb1ac1fa --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in 
tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp new file mode 100644 index 0000000000..e1c4ae91fe --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/src/radixSortThrust.dp.cpp @@ -0,0 +1,258 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include "helper_cuda.h" + +#include +#include +#include +#include + +template bool testSort(int argc, char **argv) try { + int cmdVal; + int keybits = 32; + + unsigned int numElements = 1048576; + bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly"); + bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet"); + + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + numElements = cmdVal; + + if (cmdVal < 0) { + printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal); + exit(EXIT_SUCCESS); + } + } + + if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits"); + keybits = cmdVal; + + if (keybits <= 0) { + printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits); + exit(EXIT_SUCCESS); + } + } + + unsigned int numIterations = (numElements >= 16777216) ? 10 : 100; + + if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); + numIterations = cmdVal; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Command line:\nradixSortThrust [-option]\n"); + printf("Valid options:\n"); + printf("-n= : number of elements to sort\n"); + printf("-keybits=bits : keybits must be > 0\n"); + printf( + "-keysonly : only sort an array of keys (default sorts key-value " + "pairs)\n"); + printf( + "-float : use 32-bit float keys (default is 32-bit unsigned " + "int)\n"); + printf( + "-quiet : Output only the number of elements and the time to " + "sort\n"); + printf("-help : Output a help message\n"); + exit(EXIT_SUCCESS); + } + + if (!quiet) + printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits, + floatKeys ? "float" : "unsigned int", + keysOnly ? "(only)" : "and values"); + + int deviceID = -1; + + if (0 == deviceID = dpct::dev_mgr::instance().current_device_id()) { + dpct::device_info devprop; + dpct::dev_mgr::instance().get_device(deviceID).get_device_info(devprop); + unsigned int totalMem = (keysOnly ? 2 : 4) * numElements * sizeof(T); + + if (devprop.get_global_mem_size() < totalMem) { + printf("Error: insufficient amount of memory to sort %d elements.\n", + numElements); + printf("%d bytes needed, %d bytes available\n", (int)totalMem, + (int)devprop.get_global_mem_size()); + exit(EXIT_SUCCESS); + } + } + + std::vector h_keys(numElements); + std::vector h_keysSorted(numElements); + std::vector h_values; + + if (!keysOnly) h_values = std::vector(numElements); + + // Fill up with some random data + /* + DPCT1008:14: clock function is not defined in SYCL. This is a + hardware-specific feature. Consult with your hardware vendor to find a + replacement. 
+ */ + thrust::default_random_engine rng(clock()); + + if (floatKeys) { + thrust::uniform_real_distribution u01(0, 1); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng); + } else { + thrust::uniform_int_distribution u(0, UINT_MAX); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng); + } + + if (!keysOnly) + dpct::iota(oneapi::dpl::execution::seq, h_values.begin(), h_values.end()); + + // Copy data onto the GPU + dpct::device_vector d_keys; + dpct::device_vector d_values; + + // run multiple iterations to compute an average sort time + dpct::event_ptr start_event, stop_event; + std::chrono::time_point start_event_ct1; + std::chrono::time_point stop_event_ct1; + checkCudaErrors(DPCT_CHECK_ERROR(start_event = new sycl::event())); + checkCudaErrors(DPCT_CHECK_ERROR(stop_event = new sycl::event())); + + float totalTime = 0; + + for (unsigned int i = 0; i < numIterations; i++) { + // reset data before sort + d_keys = h_keys; + + if (!keysOnly) d_values = h_values; + + /* + DPCT1012:15: Detected kernel execution time measurement pattern and + generated an initial code for time measurements in SYCL. You can change the + way time is measured depending on your goals. + */ + /* + DPCT1024:16: The original code returned the error code that was further + consumed by the program logic. This original code was replaced with 0. You + may need to rewrite the program logic consuming the error code. + */ + start_event_ct1 = std::chrono::steady_clock::now(); + checkCudaErrors(0); + + if (keysOnly) + oneapi::dpl::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end()); + else + dpct::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), d_values.begin()); + + /* + DPCT1012:17: Detected kernel execution time measurement pattern and + generated an initial code for time measurements in SYCL. You can change the + way time is measured depending on your goals. + */ + /* + DPCT1024:18: The original code returned the error code that was further + consumed by the program logic. This original code was replaced with 0. You + may need to rewrite the program logic consuming the error code. 
+ */ + stop_event_ct1 = std::chrono::steady_clock::now(); + checkCudaErrors(0); + checkCudaErrors(0); + + float time = 0; + checkCudaErrors( + DPCT_CHECK_ERROR((time = std::chrono::duration( + stop_event_ct1 - start_event_ct1) + .count()))); + totalTime += time; + } + + totalTime /= (1.0e3f * numIterations); + printf( + "radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " + "%u elements\n", + 1.0e-6f * numElements / totalTime, totalTime, numElements); + + getLastCudaError("after radixsort"); + + // Get results back to host for correctness checking + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), h_keysSorted.begin()); + + if (!keysOnly) + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_values.begin(), d_values.end(), h_values.begin()); + + getLastCudaError("copying results to host memory"); + + // Check results + bool bTestResult = oneapi::dpl::is_sorted( + oneapi::dpl::execution::seq, h_keysSorted.begin(), h_keysSorted.end()); + + checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(start_event))); + checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(stop_event))); + + if (!bTestResult && !quiet) { + return false; + } + + return bTestResult; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int main(int argc, char **argv) { + // Start logs + printf("%s Starting...\n\n", argv[0]); + + findCudaDevice(argc, (const char **)argv); + + bool bTestResult = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "float")) + bTestResult = testSort(argc, argv); + else + bTestResult = testSort(argc, argv); + + printf(bTestResult ? "Test passed\n" : "Test failed!\n"); +} \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt new file mode 100644 index 0000000000..1b0edab6d7 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/CMakeLists.txt @@ -0,0 +1,5 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -std=c++17") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core") + +include_directories(${CMAKE_SOURCE_DIR}/02_sycl_dpct_migrated/Common/) +add_subdirectory("src") \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h new file mode 100644 index 0000000000..bc9e302c52 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h @@ -0,0 +1,1022 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(dpct::err0 error) { + /* + DPCT1009:0: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case 
CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case 
NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return 
"NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:1: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:3: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_624496 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_942342 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + /* + DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:7: The "devID" device may be not the one intended for use. Adjust the + selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:8: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:9: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors( + DPCT_CHECK_ERROR(multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units())); + dpct::err0 result = + DPCT_CHECK_ERROR(clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency()); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:10: The "devID" device may be not the one intended for use. Adjust + the selected device if needed. + */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(devID))); + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR( + major = + dpct::dev_mgr::instance().get_device(devID).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = + dpct::dev_mgr::instance().get_device(devID).get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(DPCT_CHECK_ERROR( + device_count = dpct::dev_mgr::instance().device_count())); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:11: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); + checkCudaErrors( + DPCT_CHECK_ERROR(integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated())); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:12: All SYCL devices can be used by the host to submit tasks. You + may need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:13: The "current_device" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + checkCudaErrors(DPCT_CHECK_ERROR(dpct::select_device(current_device))); + + int major = 0, minor = 0; + checkCudaErrors(DPCT_CHECK_ERROR(major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version())); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors(DPCT_CHECK_ERROR( + major = dpct::dev_mgr::instance().get_device(dev).get_major_version())); + checkCudaErrors(DPCT_CHECK_ERROR( + minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version())); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h new file mode 100644 index 0000000000..47fb1ac1fa --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // <executable_name> defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples/<executable_name>/", // up 4 in tree + "../../../Samples/<executable_name>/", // up 3 in tree + "../../Samples/<executable_name>/", // up 2 in tree + + "../../../../Samples/<executable_name>/data/", // up 4 in tree + "../../../Samples/<executable_name>/data/", // up 3 in tree + "../../Samples/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/0_Introduction/<executable_name>/", // up 4 in tree + "../../../Samples/0_Introduction/<executable_name>/", // up 3 in tree + "../../Samples/0_Introduction/<executable_name>/", // up 2 in tree + + "../../../../Samples/1_Utilities/<executable_name>/", // up 4 in tree + "../../../Samples/1_Utilities/<executable_name>/", // up 3 in tree + "../../Samples/1_Utilities/<executable_name>/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features/<executable_name>/", // up 4 in tree + "../../../Samples/3_CUDA_Features/<executable_name>/", // up 3 in tree + "../../Samples/3_CUDA_Features/<executable_name>/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries/<executable_name>/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries/<executable_name>/", // up 3 in tree + "../../Samples/4_CUDA_Libraries/<executable_name>/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific/<executable_name>/", // up 4 in tree + "../../../Samples/5_Domain_Specific/<executable_name>/", // up 3 in tree + "../../Samples/5_Domain_Specific/<executable_name>/", // up 2 in tree + + "../../../../Samples/6_Performance/<executable_name>/", // up 4 in tree + "../../../Samples/6_Performance/<executable_name>/", // up 3 in tree + "../../Samples/6_Performance/<executable_name>/", // up 2 in tree + + "../../../../Samples/0_Introduction/<executable_name>/data/", // up 4 in tree + "../../../Samples/0_Introduction/<executable_name>/data/", // up 3 in tree + "../../Samples/0_Introduction/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/1_Utilities/<executable_name>/data/", // up 4 in tree + "../../../Samples/1_Utilities/<executable_name>/data/", // up 3 in tree + "../../Samples/1_Utilities/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features/<executable_name>/data/", // up 4 in tree + "../../../Samples/3_CUDA_Features/<executable_name>/data/", // up 3 in tree + "../../Samples/3_CUDA_Features/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries/<executable_name>/data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific/<executable_name>/data/", // up 4 in tree + "../../../Samples/5_Domain_Specific/<executable_name>/data/", // up 3 in tree + "../../Samples/5_Domain_Specific/<executable_name>/data/", // up 2 in tree + + "../../../../Samples/6_Performance/<executable_name>/data/", // up 4 in tree + "../../../Samples/6_Performance/<executable_name>/data/", // up 3 in tree + "../../Samples/6_Performance/<executable_name>/data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in 
tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find("<executable_name>"); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen("<executable_name>"), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt new file mode 100644 index 0000000000..233bda86e2 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(radixSortMigrated radixSortMigrated.cpp) + +target_link_libraries(radixSortMigrated OpenCL sycl) + +add_custom_target(run_radixSortMigrated radixSortMigrated) \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp new file mode 100644 index 0000000000..811abc2d4b --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/src/radixSortMigrated.cpp @@ -0,0 +1,239 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <oneapi/dpl/execution> +#include <oneapi/dpl/algorithm> +#include <oneapi/dpl/random> +#include <sycl/sycl.hpp> +#include <dpct/dpct.hpp> +#include <dpct/dpl_utils.hpp> + +#include <helper_cuda.h> + +#include <algorithm> +#include <chrono> +#include <time.h> +#include <limits.h> + +template <bool floatKeys> bool testSort(int argc, char **argv) try { + int cmdVal; + int keybits = 32; + + unsigned int numElements = 1048576; + bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly"); + bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet"); + + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + numElements = cmdVal; + + if (cmdVal < 0) { + printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal); + exit(EXIT_SUCCESS); + } + } + + if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits"); + keybits = cmdVal; + + if (keybits <= 0) { + printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits); + exit(EXIT_SUCCESS); + } + } + + unsigned int numIterations = (numElements >= 16777216) ? 10 : 100; + + if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); + numIterations = cmdVal; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Command line:\nradixSortThrust [-option]\n"); + printf("Valid options:\n"); + printf("-n=<N> : number of elements to sort\n"); + printf("-keybits=bits : keybits must be > 0\n"); + printf( + "-keysonly : only sort an array of keys (default sorts key-value " + "pairs)\n"); + printf( + "-float : use 32-bit float keys (default is 32-bit unsigned " + "int)\n"); + printf( + "-quiet : Output only the number of elements and the time to " + "sort\n"); + printf("-help : Output a help message\n"); + exit(EXIT_SUCCESS); + } + + if (!quiet) + printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits, + floatKeys ? "float" : "unsigned int", + keysOnly ? "(only)" : "and values"); + + int deviceID = -1; + deviceID = dpct::dev_mgr::instance().current_device_id(); + + if (0 == deviceID) { + dpct::device_info devprop; + dpct::dev_mgr::instance().get_device(deviceID).get_device_info(devprop); + unsigned int totalMem = (keysOnly ? 
2 : 4) * numElements * sizeof(T); + + if (devprop.get_global_mem_size() < totalMem) { + printf("Error: insufficient amount of memory to sort %d elements.\n", + numElements); + printf("%d bytes needed, %d bytes available\n", (int)totalMem, + (int)devprop.get_global_mem_size()); + exit(EXIT_SUCCESS); + } + } + + std::vector<T> h_keys(numElements); + std::vector<T> h_keysSorted(numElements); + std::vector<unsigned int> h_values; + + if (!keysOnly) h_values = std::vector<unsigned int>(numElements); + + // Fill up with some random data + oneapi::dpl::minstd_rand rng(clock()); + + if (floatKeys) { + oneapi::dpl::uniform_real_distribution<float> u01(0, 1); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng); + } else { + oneapi::dpl::uniform_int_distribution<unsigned int> u(0, UINT_MAX); + + for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng); + } + + if (!keysOnly) + dpct::iota(oneapi::dpl::execution::seq, h_values.begin(), h_values.end()); + + // Copy data onto the GPU + dpct::device_vector<T> d_keys; + dpct::device_vector<unsigned int> d_values; + + // run multiple iterations to compute an average sort time + dpct::event_ptr start_event, stop_event; + std::chrono::time_point<std::chrono::steady_clock> start_event_ct1; + std::chrono::time_point<std::chrono::steady_clock> stop_event_ct1; + start_event = new sycl::event(); + stop_event = new sycl::event(); + + float totalTime = 0; + + for (unsigned int i = 0; i < numIterations; i++) { + // reset data before sort + d_keys = h_keys; + + if (!keysOnly) d_values = h_values; + + start_event_ct1 = std::chrono::steady_clock::now(); + + if (keysOnly) + oneapi::dpl::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end()); + else + dpct::sort( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), d_values.begin()); + + stop_event_ct1 = std::chrono::steady_clock::now(); + + float time = 0; + checkCudaErrors( + DPCT_CHECK_ERROR((time = std::chrono::duration<float, std::milli>( + stop_event_ct1 - start_event_ct1) + .count()))); + totalTime += time; + } + + totalTime /= (1.0e3f * numIterations); + printf( + "radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " + "%u elements\n", + 1.0e-6f * numElements / totalTime, totalTime, numElements); + + getLastCudaError("after radixsort"); + + // Get results back to host for correctness checking + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_keys.begin(), d_keys.end(), h_keysSorted.begin()); + + if (!keysOnly) + std::copy( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + d_values.begin(), d_values.end(), h_values.begin()); + + getLastCudaError("copying results to host memory"); + + // Check results + bool bTestResult = oneapi::dpl::is_sorted( + oneapi::dpl::execution::seq, h_keysSorted.begin(), h_keysSorted.end()); + + dpct::destroy_event(start_event); + dpct::destroy_event(stop_event); + + if (!bTestResult && !quiet) { + return false; + } + + return bTestResult; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int main(int argc, char **argv) { + time_t start, end; + time(&start); + // Start logs + printf("%s Starting...\n\n", argv[0]); + + findCudaDevice(argc, (const char **)argv); + + bool bTestResult = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "float")) + bTestResult = testSort<true>(argc, argv); + else + bTestResult = testSort<false>(argc, argv); + + printf(bTestResult ? 
"Test passed\n" : "Test failed!\n"); + time(&end); + double time_taken = double(end - start); + std::cout << "Time taken by program is : " << std::fixed + << time_taken << std::setprecision(5); + std::cout << " sec " << std::endl; +} \ No newline at end of file diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt new file mode 100644 index 0000000000..b1d80bd355 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required (VERSION 3.5) + +set(CMAKE_CXX_COMPILER "icpx") + +project (radixSortThrustMigrated) +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +add_subdirectory (02_sycl_dpct_migrated) From ade1941d2bfb9c874e2aec50ec46bfa5335d0685 Mon Sep 17 00:00:00 2001 From: iochocki Date: Wed, 13 Sep 2023 14:18:05 +0200 Subject: [PATCH 5/9] Adjust helper_cuda file --- .../Common/helper_cuda.h | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h index bc9e302c52..ec9f3a4e47 100644 --- a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h @@ -601,10 +601,6 @@ void check(T result, char const *const func, const char *const file, inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) { - /* - DPCT1010:1: SYCL uses exceptions to report errors and does not use the error - codes. The call was replaced with 0. You need to rewrite this code. - */ dpct::err0 err = 0; } @@ -614,10 +610,6 @@ inline void __getLastCudaError(const char *errorMessage, const char *file, inline void __printLastCudaError(const char *errorMessage, const char *file, const int line) { - /* - DPCT1010:3: SYCL uses exceptions to report errors and does not use the error - codes. The call was replaced with 0. You need to rewrite this code. - */ dpct::err0 err = 0; } #endif @@ -761,19 +753,11 @@ inline int gpuDeviceInit(int devID) { } int computeMode = -1, major = 0, minor = 0; - /* - DPCT1035:5: All SYCL devices can be used by the host to submit tasks. You may - need to adjust this code. - */ checkCudaErrors(DPCT_CHECK_ERROR(computeMode = 1)); checkCudaErrors(DPCT_CHECK_ERROR( major = dpct::dev_mgr::instance().get_device(devID).get_major_version())); checkCudaErrors(DPCT_CHECK_ERROR( minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); - /* - DPCT1035:6: All SYCL devices can be used by the host to submit tasks. You may - need to adjust this code. 
- */ if (computeMode == 0) { fprintf(stderr, "Error: device is running in Date: Wed, 13 Sep 2023 14:21:51 +0200 Subject: [PATCH 6/9] Add intitial documentation --- .../radix_sort_thrust_migrated/License.txt | 7 + .../radix_sort_thrust_migrated/README.md | 178 ++++++++++++++++++ .../radix_sort_thrust_migrated/sample.json | 29 +++ 3 files changed, 214 insertions(+) create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md create mode 100644 DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt new file mode 100644 index 0000000000..80f3e07572 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md new file mode 100644 index 0000000000..5be873dbc6 --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md @@ -0,0 +1,178 @@ +# `cuRAND Migration` Sample + +The `cuRAND Migration` sample is a collection of code samples that demonstrate the cuBLAS equivalent in Intel® oneAPI Math Kernel Library (oneMKL). + +| Area | Description +|:--- |:--- +| What you will learn | How to begin migrating CUDA code to a SYCL*-compliant equivalent +| Time to complete | 30 minutes +| Category | Code Optimization + +For more information on oneMKL and complete documentation of all oneMKL routines, see https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-documentation.html. + +## Purpose + +The samples source code using SYCL were migrated from CUDA source code for offloading computations to a GPU/CPU. The sample demonstrates how to migrate code to SYCL, optimize the migration steps, and improve processing time. + +Each of the cuRAND samples source files shows the usage of different oneMKL cuRAND routines. All are basic programs containing the usage of a single method of generating pseudorandom numbers. 
+ +>**Note**: This sample is based on the [*cuRAND Library - APIs Examples*](https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuRAND) samples in the NVIDIA/CUDALibrary GitHub repository. + +## Prerequisites + +| Optimized for | Description +|:--- |:--- +| OS | Ubuntu* 20.04 +| Hardware | 10th Gen Intel® processors or newer +| Software | Intel® oneAPI DPC++/C++ Compiler + +## Key Implementation Details + +This sample contains two sets of sources in the following folders: + +| Folder Name | Description +|:--- |:--- +| `01_sycl_dpct_output` | Contains output of Intel® DPC++ Compatibility Tool used to migrate SYCL-compliant code from CUDA code.
This SYCL code has some unmigrated or incorrectly generated code that has to be manually fixed before it is functional. (The code does not work as supplied.) +| `02_sycl_dpct_migrated` | Contains SYCL to CUDA migrated code generated by using the Intel® DPC++ Compatibility Tool with the manual changes implemented to make the code fully functional. + +These functions are classified into eight different directories, each based on a RNG engine. There are **48** samples: + +## Set Environment Variables + +When working with the command-line interface (CLI), you should configure the oneAPI toolkits using environment variables. Set up your CLI environment by sourcing the `setvars` script every time you open a new terminal window. This practice ensures that your compiler, libraries, and tools are ready for development. + +## Build the `cuRAND Migration` Sample + +> **Note**: If you have not already done so, set up your CLI +> environment by sourcing the `setvars` script in the root of your oneAPI installation. +> +> Linux*: +> - For system wide installations: `. /opt/intel/oneapi/setvars.sh` +> - For private installations: ` . ~/intel/oneapi/setvars.sh` +> - For non-POSIX shells, like csh, use the following command: `bash -c 'source /setvars.sh ; exec csh'` +> +> For more information on configuring environment variables, see *[Use the setvars Script with Linux* or macOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html)*. + +### On Linux* + +1. Change to the sample directory. +2. Build the samples. + ``` + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + + By default, this command sequence builds the version of the source code in the `02_sycl_dpct_migrated` folder. + +#### Troubleshooting + +If an error occurs, you can get more details by running `make` with +the `VERBOSE=1` argument: +``` +make VERBOSE=1 +``` +If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the [Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html) for more information on using the utility. + + +## Run the `cuRAND Migration` Sample + +### On Linux + +Run the programs on a CPU or GPU. Each sample uses a default device, which in most cases is a GPU. + +1. Run the samples in the `02_sycl_dpct_migrated` folder. + ``` + make run_mt19937_uniform + ``` + +### Build and Run the `cuRAND Migration` Sample in Intel® DevCloud (Optional) + +When running a sample in the Intel® DevCloud, you must specify the compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit [Get Started Guide](https://devcloud.intel.com/oneapi/get_started/). + +#### Build and Run Samples in Batch Mode (Optional) + +You can submit build and run jobs through a Portable Bash Script (PBS). A job is a script that submitted to PBS through the `qsub` utility. By default, the `qsub` utility does not inherit the current environment variables or your current working directory, so you might need to submit jobs to configure the environment variables. 
To indicate the correct working directory, you can use either absolute paths or pass the `-d \` option to `qsub`. + +1. Open a terminal on a Linux* system. +2. Log in to Intel® DevCloud. + ``` + ssh devcloud + ``` +3. Download the samples. + ``` + git clone https://github.com/oneapi-src/oneAPI-samples.git + ``` +4. Change to the sample directory. +5. Configure the sample for a GPU node and choose the backend as OpenCL. + ``` + qsub -I -l nodes=1:gpu:ppn=2 -d . + export SYCL_DEVICE_FILTER=opencl:gpu + ``` + - `-I` (upper case I) requests an interactive session. + - `-l nodes=1:gpu:ppn=2` (lower case L) assigns one full GPU node. + - `-d .` makes the current folder as the working directory for the task. + + |Available Nodes |Command Options + |:--- |:--- + | GPU |`qsub -l nodes=1:gpu:ppn=2 -d .` + | CPU |`qsub -l nodes=1:xeon:ppn=2 -d .` + +6. Perform build steps as you would on Linux. +7. Run the programs. +8. Clean up the project files. + ``` + make clean + ``` +9. Disconnect from the Intel® DevCloud. + ``` + exit + ``` + +## Example Output + +This is example output if you built the default and ran `run_mt19937_uniform`. + +``` +Scanning dependencies of target mt19937_uniform +[ 50%] Building CXX object 02_sycl_dpct_migrated/mt19937/CMakeFiles/mt19937_uniform.dir/mt19937_uniform.cpp.o +[100%] Linking CXX executable ../../bin/mt19937_uniform +[100%] Built target mt19937_uniform +Host +0.966454 +0.778166 +0.440733 +0.116851 +0.007491 +0.090644 +0.910976 +0.942535 +0.939269 +0.807002 +0.582228 +0.034926 +===== +Device +0.966454 +0.778166 +0.440733 +0.116851 +0.007491 +0.090644 +0.910976 +0.942535 +0.939269 +0.807002 +0.582228 +0.034926 +===== +[100%] Built target run_mt19937_uniform +``` + +## License + +Code samples are licensed under the MIT license. See +[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details. + +Third party program licenses are at [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt). 
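For orientation alongside the README above, here is a minimal, self-contained sketch (not part of the committed patch) of the Thrust-to-oneDPL mapping that `radixSortMigrated.cpp` relies on: a device-side sort expressed through a oneDPL device execution policy. The queue, array size, and data below are illustrative assumptions, not values taken from the sample.

```cpp
// Illustrative sketch only: sorting keys on the default SYCL device with oneDPL,
// the same pattern the migrated sample uses in place of thrust::sort.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;  // default device, comparable to the sample's runtime selection
  const int n = 8;
  unsigned int *keys = sycl::malloc_shared<unsigned int>(n, q);
  for (int i = 0; i < n; ++i) keys[i] = n - i;  // 8 7 6 ... 1

  // CUDA/Thrust:  thrust::sort(d_keys.begin(), d_keys.end());
  // SYCL/oneDPL:  same algorithm, with an explicit device execution policy.
  oneapi::dpl::sort(oneapi::dpl::execution::make_device_policy(q), keys, keys + n);

  for (int i = 0; i < n; ++i) std::printf("%u ", keys[i]);  // prints 1 ... 8
  std::printf("\n");
  sycl::free(keys, q);
  return 0;
}
```

A single-file build with the oneAPI DPC++ compiler (for example `icpx -fsycl`) should be enough to try this sketch; the sample itself is built through the CMake files added in this patch.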
diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json new file mode 100644 index 0000000000..a43dd2bace --- /dev/null +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "E3626FAB-DCD8-465F-A4E7-BF4A858D6583", + "name": "Jacobi Iterative Solver", + "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL/Dense Linear Algebra"], + "description": "Jacobi Iterative Solver provides step by step instructions for CPU, GPU and multiple GPU offload", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["CPU", "GPU"], + "gpuRequired": ["gen11"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run_1_cpu", + "make run_2_gpu", + "make run_3_multi_gpu" + ] + } + ] + }, + "expertise": "Code Optimization" + } + \ No newline at end of file From 55014d4b3853855e2d65fb85acba5ede74c422e7 Mon Sep 17 00:00:00 2001 From: iochocki Date: Tue, 24 Oct 2023 15:31:48 +0200 Subject: [PATCH 7/9] Modify json --- .../radix_sort_thrust_migrated/sample.json | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json index a43dd2bace..eaaa16384f 100644 --- a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/sample.json @@ -1,8 +1,8 @@ { - "guid": "E3626FAB-DCD8-465F-A4E7-BF4A858D6583", - "name": "Jacobi Iterative Solver", - "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL/Dense Linear Algebra"], - "description": "Jacobi Iterative Solver provides step by step instructions for CPU, GPU and multiple GPU offload", + "guid": "DA09E38F-2DCC-40C9-B312-588F59AF9A60", + "name": "Radix Sort Thrust Migrated", + "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL/Graph Traversal"], + "description": "Radix Sort Thrust Migrated demonstrates a fast and efficient parallel radix sort that uses the SYCL version of the THRUST library", "toolchain": ["dpcpp"], "os": ["linux"], "targetDevice": ["CPU", "GPU"], @@ -17,9 +17,7 @@ "cd build", "cmake ..", "make", - "make run_1_cpu", - "make run_2_gpu", - "make run_3_multi_gpu" + "make run_radixSortMigrated" ] } ] From 28445de24b4de18d62d4339e4d619de8bbe70939 Mon Sep 17 00:00:00 2001 From: IgorOchocki <36711066+IgorOchocki@users.noreply.github.com> Date: Tue, 24 Oct 2023 16:48:28 +0200 Subject: [PATCH 8/9] Update README.md --- .../radix_sort_thrust_migrated/README.md | 62 +++++++------------ 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md index 5be873dbc6..af3e7ddf5f 100644 --- a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/README.md @@ -1,6 +1,6 @@ -# `cuRAND Migration` Sample +# `Radix Sort Thrust Migrated` Sample -The `cuRAND Migration` sample is a collection of code samples that demonstrate the cuBLAS equivalent in Intel® oneAPI Math Kernel Library (oneMKL). 
+The `Radix Sort Thrust Migrated` sample is a CUDA to SYCL migrated sample that shows the THRUST equivalent in Intel® oneAPI. | Area | Description |:--- |:--- @@ -14,9 +14,7 @@ For more information on oneMKL and complete documentation of all oneMKL routines The samples source code using SYCL were migrated from CUDA source code for offloading computations to a GPU/CPU. The sample demonstrates how to migrate code to SYCL, optimize the migration steps, and improve processing time. -Each of the cuRAND samples source files shows the usage of different oneMKL cuRAND routines. All are basic programs containing the usage of a single method of generating pseudorandom numbers. - ->**Note**: This sample is based on the [*cuRAND Library - APIs Examples*](https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuRAND) samples in the NVIDIA/CUDALibrary GitHub repository. +>**Note**: This sample is based on the [*radixSortThrust*](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/2_Concepts_and_Techniques/radixSortThrust) sample in the NVIDIA/cuda-samples GitHub repository. ## Prerequisites @@ -76,7 +74,7 @@ make VERBOSE=1 If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the [Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html) for more information on using the utility. -## Run the `cuRAND Migration` Sample +## Run the `Radix Sort Thrust Migrated` Sample ### On Linux @@ -84,10 +82,10 @@ Run the programs on a CPU or GPU. Each sample uses a default device, which in mo 1. Run the samples in the `02_sycl_dpct_migrated` folder. ``` - make run_mt19937_uniform + make run_radixSortMigrated ``` -### Build and Run the `cuRAND Migration` Sample in Intel® DevCloud (Optional) +### Build and Run the `Radix Sort Thrust Migrated` Sample in Intel® DevCloud (Optional) When running a sample in the Intel® DevCloud, you must specify the compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit [Get Started Guide](https://devcloud.intel.com/oneapi/get_started/). @@ -132,42 +130,24 @@ You can submit build and run jobs through a Portable Bash Script (PBS). A job is ## Example Output -This is example output if you built the default and ran `run_mt19937_uniform`. +This is example output if you built the default and ran `run_radixSortMigrated`. ``` Scanning dependencies of target mt19937_uniform -[ 50%] Building CXX object 02_sycl_dpct_migrated/mt19937/CMakeFiles/mt19937_uniform.dir/mt19937_uniform.cpp.o -[100%] Linking CXX executable ../../bin/mt19937_uniform -[100%] Built target mt19937_uniform -Host -0.966454 -0.778166 -0.440733 -0.116851 -0.007491 -0.090644 -0.910976 -0.942535 -0.939269 -0.807002 -0.582228 -0.034926 -===== -Device -0.966454 -0.778166 -0.440733 -0.116851 -0.007491 -0.090644 -0.910976 -0.942535 -0.939269 -0.807002 -0.582228 -0.034926 -===== -[100%] Built target run_mt19937_uniform +[ 50%] Building CXX object 02_sycl_dpct_migrated/src/CMakeFiles/radixSortMigrated.dir/radixSortMigrated.cpp.o +[100%] Linking CXX executable ../../bin/radixSortMigrated +[100%] Built target radixSortMigrated +Scanning dependencies of target run_radixSortMigrated +../../bin/radixSortMigrated Starting... 
+ + +Sorting 1048576 32-bit unsigned int keys and values + +radixSortThrust, Throughput = 5.5147 MElements/s, Time = 0.19014 s, Size = 1048576 elements +Test passed +Time taken by program is : 292.000000 sec +[100%] Built target run_radixSortMigrated + ``` ## License From c8090f5744ac2fd45462d0cca2246f06bc172a70 Mon Sep 17 00:00:00 2001 From: iochocki Date: Tue, 24 Oct 2023 16:49:02 +0200 Subject: [PATCH 9/9] Modify helper file --- .../02_sycl_dpct_migrated/Common/helper_cuda.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h index ec9f3a4e47..9ac79648c8 100644 --- a/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h +++ b/DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/02_sycl_dpct_migrated/Common/helper_cuda.h @@ -666,10 +666,6 @@ inline int _ConvertSMVer2Cores(int major, int minor) { // If we don't find the values, we default use the previous one // to run properly - printf( - "MapSMtoCores for SM %d.%d is undefined." - " Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); return nGpuArchCoresPerSM[index - 1].Cores; } @@ -714,10 +710,6 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) { // If we don't find the values, we default use the previous one // to run properly - printf( - "MapSMtoArchName for SM %d.%d is undefined." - " Default to use %s\n", - major, minor, nGpuArchNameSM[index - 1].name); return nGpuArchNameSM[index - 1].name; } // end of GPU Architecture definitions @@ -884,8 +876,6 @@ inline int findCudaDevice(int argc, const char **argv) { checkCudaErrors(DPCT_CHECK_ERROR( minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version())); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - devID, _ConvertSMVer2ArchName(major, minor), major, minor); } @@ -925,9 +915,6 @@ inline int findIntegratedGPU() { checkCudaErrors(DPCT_CHECK_ERROR(minor = dpct::dev_mgr::instance() .get_device(current_device) .get_minor_version())); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, _ConvertSMVer2ArchName(major, minor), major, minor); - return current_device; } else { devices_prohibited++;