diff --git a/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/SYCL_Program_Structure.ipynb b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/SYCL_Program_Structure.ipynb
old mode 100644
new mode 100755
index 29e105df30..ab3a8e2262
--- a/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/SYCL_Program_Structure.ipynb
+++ b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/SYCL_Program_Structure.ipynb
@@ -14,14 +14,23 @@
 "metadata": {},
 "source": [
 "##### Sections\n",
- "- [What is Data Parallel C++ and SYCL?](#What-is-Data-Parallel-C++-and-SYCL?)\n",
- "- _Code:_ [Device Selector](#Device-Selector)\n",
- "- [Data Parallel Kernels](#Parallel-Kernels)\n",
- "- [SYCL Code Anatomy](#SYCL-Code-Anatomy)\n",
- "- _Code:_ [Implicit dependency with Accessors](#Implicit-dependency-with-Accessors)\n",
- "- _Code:_ [Synchronization: Host Accessor](#Synchronization:-Host-Accessor)\n",
- "- _Code:_ [Synchronization: Buffer Destruction](#Synchronization:-Buffer-Destruction)\n",
+ "- [What is SYCL?](#What-is-SYCL?)\n",
+ "- [SYCL Classes](#SYCL-Classes)\n",
+ " - [Device](#Device)\n",
+ " - _Code:_ [Device Selector](#Device-Selector)\n",
+ " - [Queue](#Queue)\n",
+ " - [Kernel](#Kernel)\n",
+ "- [Parallel Kernels](#Parallel-Kernels)\n",
+ " - [Basic Parallel Kernels](#Basic-Parallel-Kernels)\n",
+ " - [ND-Range Kernels](#ND-Range-Kernels)\n",
+ "- [Memory Models](#Memory-Models)\n",
+ " - _Code:_ [Vector Add implementation using USM and Buffers](#Vector-Add-implementation-using-USM-and-Buffers)\n",
+ " - [Unified Shared Memory Model](#Unified-Shared-Memory-Model)\n",
+ " - [Buffer Memory Model](#Buffer-Memory-Model)\n",
+ " - _Code:_ [Synchronization: Host Accessor](#Synchronization:-Host-Accessor)\n",
+ " - _Code:_ [Synchronization: Buffer Destruction](#Synchronization:-Buffer-Destruction)\n",
 "- _Code:_ [Custom Device Selector](#Custom-Device-Selector)\n",
+ "- [Multi-GPU Selection](#Multi-GPU-Selection)\n",
 "- _Code:_ [Complex Number Multiplication](#Code-Sample:-Complex-Number-Multiplication)\n",
 "- _Lab Exercise:_ [Vector Add](#Lab-Exercise:-Vector-Add)"
 ]
 },
@@ -34,23 +43,85 @@
 "* Explain the __SYCL__ fundamental classes\n",
 "* Use __device selection__ to offload kernel workloads\n",
 "* Decide when to use __basic parallel kernels__ and __ND Range Kernels__\n",
- "* Create a __host Accessor__\n",
+ "* Use the __Unified Shared Memory__ or __Buffer-Accessor__ memory model in a SYCL program\n",
 "* Build a sample __SYCL application__ through hands-on lab exercises"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true,
+ "tags": []
+ },
+ "source": [
+ "## What is SYCL?\n",
+ "__SYCL__ is an open standard for programming heterogeneous devices in a single source. A SYCL program is invoked on the host computer and offloads the computation to an accelerator. Programmers use familiar C++ and library constructs with added functionalities like a __queue__ for work targeting, __buffer__ or __Unified Shared Memory__ for data management, and __parallel_for__ for parallelism to direct which parts of the computation and data should be offloaded."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SYCL Language and Runtime\n",
+ "SYCL language and runtime consists of a set of C++ classes, templates, and libraries.\n",
+ "\n",
+ " __Application scope__ and __command group scope__:\n",
+ " * Code that executes on the host\n",
+ " * The full capabilities of C++ are available at application and command group scope \n",
+ "\n",
+ "__Kernel__ scope:\n",
+ " * Code that executes on the device. \n",
+ " * At __kernel__ scope there are __limitations__ in accepted C++\n",
+ "\n",
+ "\n",
+ "\n",
+ "#### C++ SYCL Code Example\n",
+ "Let's look at a simple SYCL program that offloads computation to a GPU. The code does the following:\n",
+ "1. selects GPU device for offload\n",
+ "2. allocates memory that can be accessed on host and GPU\n",
+ "3. initializes data array on host\n",
+ "4. offloads computation to GPU\n",
+ "5. prints output on host\n",
+ "\n",
+ "\n",
+ "```cpp\n",
+ "#include <sycl/sycl.hpp>\n",
+ "static const int N = 16;\n",
+ "int main(){\n",
+ " sycl::queue q(sycl::gpu_selector_v); // <--- select GPU for offload \n",
+ "\n",
+ " int *data = sycl::malloc_shared<int>(N, q); // <--- allocate memory\n",
+ "\n",
+ " for(int i=0; i<N; i++) data[i] = i; // <--- initialize data array on host\n",
+ "\n",
+ " q.parallel_for(N, [=](auto i){ // <--- offload computation to GPU\n",
+ "   data[i] *= 2;\n",
+ " }).wait();\n",
+ "\n",
+ " for(int i=0; i<N; i++) std::cout << data[i] << \"\\n\"; // <--- print output on host\n",
+ "\n",
+ " sycl::free(data, q);\n",
+ " return 0;\n",
+ "}\n",
+ "```"
+ ]
+ },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SYCL Language and Runtime\n",
- "SYCL language and runtime consists of a set of C++ classes, templates, and libraries.\n",
- "\n",
- " __Application scope__ and __command group scope__:\n",
- " * Code that executes on the host\n",
- " * The full capabilities of C++ are available at application and command group scope \n",
- "\n",
- "__Kernel__ scope:\n",
- " * Code that executes on the device. \n",
- " * At __kernel__ scope there are __limitations__ in accepted C++\n"
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
@@ -226,7 +283,7 @@
 "Below is how you can offload to accelerator\n",
 "\n",
 "```cpp\n",
- "h.parallel_for(range<1>(1024), [=](id<1> i){\n",
+ "q.parallel_for(range<1>(1024), [=](id<1> i){\n",
 "    A[i] =  B[i] + C[i];\n",
 "});\n",
 "```\n"
 ]
 },
@@ -236,13 +293,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "## Basic Parallel Kernels\n",
+ "### Basic Parallel Kernels\n",
 "\n",
 "The functionality of basic parallel kernels is exposed via __range__, __id__, and __item__ classes. __Range__ class is used to describe the __iteration space__ of parallel execution and __id__ class is used to __index__ an individual instance of a kernel in a parallel execution\n",
 "\n",
 "\n",
 "```cpp\n",
- "h.parallel_for(range<1>(1024), [=](id<1> i){\n",
+ "q.parallel_for(range<1>(1024), [=](id<1> i){\n",
 "// CODE THAT RUNS ON DEVICE \n",
 "});\n",
 "\n",
@@ -251,7 +308,7 @@
 "\n",
 "\n",
 "```cpp\n",
- "h.parallel_for(range<1>(1024), [=](item<1> item){\n",
+ "q.parallel_for(range<1>(1024), [=](item<1> item){\n",
 " auto i = item.get_id();\n",
 " auto R = item.get_range();\n",
 " // CODE THAT RUNS ON DEVICE\n",
@@ -266,13 +323,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "## ND RANGE KERNELS\n",
+ "### ND-Range Kernels\n",
 "Basic Parallel Kernels are easy way to parallelize a for-loop but does not allow performance optimization at hardware level. __ND-Range kernel__ is another way to expresses parallelism which enable low level performance tuning by providing access to __local memory and mapping executions__ to compute units on hardware. 
The entire iteration space is divided into smaller groups called __work-groups__, __work-items__ within a work-group are scheduled on a single compute unit on hardware.\n", "\n", "The grouping of kernel executions into work-groups will allow control of resource usage and load balance work distribution.The functionality of nd_range kernels is exposed via __nd_range__ and __nd_item__ classes. __nd_range__ class represents a __grouped execution range__ using global execution range and the local execution range of each work-group. __nd_item__ class represents an __individual instance__ of a kernel function and allows to query for work-group range and index.\n", "\n", "```cpp\n", - "h.parallel_for(nd_range<1>(range<1>(1024),range<1>(64)), [=](nd_item<1> item){\n", + "q.parallel_for(nd_range<1>(range<1>(1024),range<1>(64)), [=](nd_item<1> item){\n", "    auto idx = item.get_global_id();\n", "    auto local_id = item.get_local_id();\n", "    // CODE THAT RUNS ON DEVICE\n", @@ -285,80 +342,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Buffer Model\n", - "__Buffers encapsulate__ data in a SYCL application across both devices and host. __Accessors__ is the mechanism to access buffer data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SYCL Code Anatomy\n", - "Programs which utilize oneAPI require the include of __cl/sycl.hpp__. It is recommended to employ the namespace statement to save typing repeated references into the cl::sycl namespace.\n", - "\n", - "```cpp\n", - "#include \n", - "using namespace cl::sycl;\n", - "```\n", - "\n", - "__SYCL programs__ are standard C++. The program is invoked on the __host__ computer, and offloads computation to the __accelerator__. A programmer uses SYCL’s __queue, buffer, device, and kernel abstractions__ to direct which parts of the computation and data should be offloaded.\n", - "\n", - "As a first step in a SYCL program we create a __queue__. We offload computation to a __device__ by submitting tasks to a queue. The programmer can choose CPU, GPU, FPGA, and other devices through the __selector__. This program uses the default q here, which means SYCL runtime selects the most capable device available at runtime by using the default selector. We will talk about the devices, device selectors, and the concepts of buffers, accessors and kernels in the upcoming modules but below is a simple SYCL program for you to get started with the above concepts.\n", - "\n", - "Device and host can either share physical __memory__ or have distinct memories. When the memories are distinct, offloading computation requires __copying data between host and device__. SYCL does not require the programmer to manage the data copies. By creating __Buffers and Accessors__, SYCL ensures that the data is available to host and device without any programmer effort. SYCL also allows the programmer explicit control over data movement when it is necessary to achieve best peformance.\n", - "\n", - "In a SYCL program, we define a __kernel__, which is applied to every point in an index space. For simple programs like this one, the index space maps directly to the elements of the array. The kernel is encapsulated in a __C++ lambda function__. The lambda function is passed a point in the index space as an array of coordinates. For this simple program, the index space coordinate is the same as the array index. The __parallel_for__ in the below program applies the lambda to the index space. 
The index space is defined in the first argument of the parallel_for as a 1 dimensional __range from 0 to N-1__.\n",
- "\n",
- "\n",
- "The code below shows Simple Vector addition using SYCL. Read through the comments addressed in step 1 through step 6.\n",
- "\n",
- "```cpp\n",
- "void SYCL_code(int* a, int* b, int* c, int N) {\n",
- "  //Step 1: create a device queue\n",
- " //(developer can specify a device type via device selector or use default selector)\n",
- " auto R = range<1>(N);\n",
- "  queue q;\n",
- "  //Step 2: create buffers (represent both host and device memory)\n",
- "  buffer buf_a(a, R);\n",
- "  buffer buf_b(b, R);\n",
- "  buffer buf_c(c, R);\n",
- "  //Step 3: submit a command for (asynchronous) execution\n",
- "  q.submit([&](handler &h){\n",
- "  //Step 4: create buffer accessors to access buffer data on the device\n",
- " accessor A(buf_a,h,read_only);\n",
- " accessor B(buf_b,h,read_only);\n",
- " accessor C(buf_c,h,write_only);\n",
- "  \n",
- "  //Step 5: send a kernel (lambda) for execution\n",
- "  h.parallel_for(range<1>(N), [=](auto i){\n",
- " //Step 6: write a kernel\n",
- " //Kernel invocations are executed in parallel\n",
- " //Kernel is invoked for each element of the range\n",
- " //Kernel invocation has access to the invocation id\n",
- "    C[i] = A[i] + B[i];\n",
- "    });\n",
- "  });\n",
- "}\n",
- "```"
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
+ "## Memory Models\n",
 "\n",
- "## Implicit dependency with Accessors\n",
- "* Accessors create __data dependencies__ in the SYCL graph that order kernel executions\n",
- "* If two kernels use the same buffer, the second kernel needs to wait for the completion of the first kernel to avoid race conditions. \n",
+ "A SYCL application can be written using one of two memory models:\n",
+ "- Unified Shared Memory Model (USM)\n",
+ "- Buffer Memory Model\n",
 "\n",
+ "The __Unified Shared Memory__ (USM) model is a pointer-based approach, similar to C/C++ pointer-based memory allocation, which makes migrating C/C++/CUDA* applications to SYCL easier. Dependencies between multiple kernels are explicitly handled using events (see the short sketch after the run instructions below).\n",
 "\n",
- "\n",
+ "The __Buffer Memory Model__ introduces a memory abstraction called buffers, which are accessed through accessors that allow setting read/write permissions and other properties on the memory. Buffers can represent data in 1, 2, or 3 dimensions, which makes programming kernels that work on 2- or 3-dimensional data easier. Dependencies between multiple kernels are implicitly handled.\n",
 "\n",
 "\n",
- "The SYCL code below demonstrates Implicit dependency with Accessors: Inspect code, there are no modifications necessary:\n",
- "\n",
+ "#### Vector Add implementation using USM and Buffers\n",
+ "The SYCL code below shows a vector add computation implemented using both the USM and Buffer memory models. Inspect the code; there are no modifications necessary:\n",
 "1. Inspect the code cell below and click run ▶ to save the code to file\n",
- "\n",
 "2. Next run ▶ the cell in the __Build and Run__ section below the code to compile and execute the code."
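+ "\n",
+ "Before running the sample, here is the short sketch referenced above for the __USM__ model. It is illustrative only and not part of the sample that follows; `N` and the `data` array are assumed placeholder names. It shows one kernel declaring an explicit dependency on another through a `sycl::event`:\n",
+ "\n",
+ "```cpp\n",
+ "sycl::queue q;\n",
+ "int *data = sycl::malloc_shared<int>(N, q);\n",
+ "\n",
+ "// the first kernel returns an event\n",
+ "sycl::event e1 = q.parallel_for(N, [=](auto i){ data[i] = i; });\n",
+ "\n",
+ "// the second kernel explicitly waits for e1 before it starts\n",
+ "q.submit([&](sycl::handler &h){\n",
+ "  h.depends_on(e1);\n",
+ "  h.parallel_for(N, [=](auto i){ data[i] *= 2; });\n",
+ "}).wait();\n",
+ "\n",
+ "sycl::free(data, q);\n",
+ "```"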
] }, @@ -368,7 +365,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile lab/buffer_sample.cpp\n", + "%%writefile lab/vector_add_usm_buffers.cpp\n", "//==============================================================\n", "// Copyright © Intel Corporation\n", "//\n", @@ -376,52 +373,84 @@ "// =============================================================\n", "#include \n", "\n", - "constexpr int num=16;\n", "using namespace sycl;\n", "\n", - " int main() {\n", - " auto R = range<1>{ num };\n", - " //Create Buffers A and B\n", - " buffer A{ R }, B{ R };\n", - " //Create a device queue\n", - " queue Q;\n", - " //Submit Kernel 1\n", - " Q.submit([&](handler& h) {\n", - " //Accessor for buffer A\n", - " accessor out(A,h,write_only);\n", - " h.parallel_for(R, [=](auto idx) {\n", - " out[idx] = idx[0]; }); });\n", - " //Submit Kernel 2\n", - " Q.submit([&](handler& h) {\n", - " //This task will wait till the first queue is complete\n", - " accessor out(A,h,write_only);\n", - " h.parallel_for(R, [=](auto idx) {\n", - " out[idx] += idx[0]; }); });\n", - " //Submit Kernel 3\n", - " Q.submit([&](handler& h) { \n", - " //Accessor for Buffer B\n", - " accessor out(B,h,write_only);\n", - " h.parallel_for(R, [=](auto idx) {\n", - " out[idx] = idx[0]; }); });\n", - " //Submit task 4\n", - " Q.submit([&](handler& h) {\n", - " //This task will wait till kernel 2 and 3 are complete\n", - " accessor in (A,h,read_only);\n", - " accessor inout(B,h);\n", - " h.parallel_for(R, [=](auto idx) {\n", - " inout[idx] *= in[idx]; }); }); \n", - " \n", - " // And the following is back to device code\n", - " host_accessor result(B,read_only);\n", - " for (int i=0; i(N, q); \n", + " auto b_device = malloc_device(N, q); \n", + " auto c_device = malloc_device(N, q); \n", + " //Step 3: copy memory from host to device\n", + " q.memcpy(a_device, a, N*sizeof(int));\n", + " q.memcpy(b_device, b, N*sizeof(int));\n", + " q.wait();\n", + " //Step 4: send a kernel (lambda) for execution\n", + " q.parallel_for(N, [=](auto i){\n", + " //Step 5: write a kernel\n", + " c_device[i] = a_device[i] + b_device[i];\n", + " }).wait();\n", + " //Step 6: copy the result back to host\n", + " q.memcpy(c, c_device, N*sizeof(int)).wait();\n", + " //Step 7: free device allocation\n", + " free(a_device, q);\n", + " free(b_device, q);\n", + " free(c_device, q);\n", + "}\n", + "\n", + "// kernel function to compute vector add using Buffer memory model\n", + "void kernel_buffers(int* a, int* b, int* c, int N) {\n", + " //Step 1: create a device queue\n", + " queue q;\n", + " //Step 2: create buffers \n", + " buffer buf_a(a, range<1>(N));\n", + " buffer buf_b(b, range<1>(N));\n", + " buffer buf_c(c, range<1>(N));\n", + " //Step 3: submit a command for (asynchronous) execution\n", + " q.submit([&](handler &h){\n", + " //Step 4: create buffer accessors to access buffer data on the device\n", + " accessor A(buf_a, h, read_only);\n", + " accessor B(buf_b, h, read_only);\n", + " accessor C(buf_c, h, write_only);\n", + " //Step 5: send a kernel (lambda) for execution\n", + " h.parallel_for(N, [=](auto i){\n", + " //Step 6: write a kernel\n", + " C[i] = A[i] + B[i];\n", + " });\n", + " });\n", + "}\n", + "\n", + "int main() {\n", + " // initialize data arrays on host\n", + " constexpr int N = 256;\n", + " int a[N], b[N], c[N];\n", + " for (int i=0; i\n", + "using namespace sycl;\n", + "```\n", + "\n", + "__SYCL programs__ are standard C++. The program is invoked on the __host__ computer, and offloads computation to the __accelerator__. 
A programmer uses SYCL’s __queue and kernel abstractions__ to direct which parts of the computation and data should be offloaded.\n",
+ "\n",
+ "As a first step in a SYCL program we create a __queue__. We offload computation to a __device__ by submitting tasks to a queue. The programmer can choose CPU, GPU, FPGA, and other devices through the __selector__. This program uses the default q here, which means SYCL runtime selects the most capable device available at runtime by using the default selector.\n",
+ "\n",
+ "Device and host can either share physical __memory__ or have distinct memories. When the memories are distinct, offloading computation requires __copying data between host and device__. We use the USM device allocation `malloc_device` to allocate memory on the device and copy data between host and device using the `memcpy` method.\n",
+ "\n",
+ "In a SYCL program, we define a __kernel__, which is applied to every point in an index space. For simple programs like this one, the index space maps directly to the elements of the array. The kernel is encapsulated in a __C++ lambda function__. The lambda function is passed a point in the index space as an array of coordinates. For this simple program, the index space coordinate is the same as the array index. The __parallel_for__ in the below program applies the lambda to the index space. The index space is defined in the first argument of the parallel_for as a 1 dimensional __range from 0 to N-1__.\n",
+ "\n",
+ "The code below shows Simple Vector addition using SYCL. Read through the comments addressed in step 1 through step 6.\n",
+ "\n",
+ "```cpp\n",
+ "void SYCL_code(int* a, int* b, int* c, int N) {\n",
+ "  //Step 1: create a device queue\n",
+ " //(developer can specify a device type via device selector or use default selector)\n",
+ "  queue q;\n",
+ "  //Step 2: create USM device allocation\n",
+ "  auto a_device = malloc_device<int>(N, q); \n",
+ " auto b_device = malloc_device<int>(N, q); \n",
+ " auto c_device = malloc_device<int>(N, q); \n",
+ "  //Step 3: copy memory from host to device\n",
+ " q.memcpy(a_device, a, N*sizeof(int));\n",
+ " q.memcpy(b_device, b, N*sizeof(int));\n",
+ " q.wait();\n",
+ "  //Step 4: send a kernel (lambda) for execution\n",
+ "  q.parallel_for(N, [=](auto i){\n",
+ " //Step 5: write a kernel\n",
+ " //Kernel invocations are executed in parallel\n",
+ " //Kernel is invoked for each element of the range\n",
+ " //Kernel invocation has access to the invocation id\n",
+ "    c_device[i] = a_device[i] + b_device[i];\n",
+ "  }).wait();\n",
+ " //Step 6: copy the result back to host\n",
+ " q.memcpy(c, c_device, N*sizeof(int)).wait();\n",
+ "}\n",
+ "```"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "## Host Accessors\n",
- "The Host Accessor is an accessor which uses host buffer access target. It is created outside of the scope of the command group and the data that this gives access to will be available on the host. These are used to synchronize the data back to the host by constructing the host accessor objects. Buffer destruction is the other way to synchronize the data back to the host.\n"
+ "### Buffer Memory Model\n",
+ "__Buffers encapsulate__ data in a SYCL application across both devices and host. __Accessors__ is the mechanism to access buffer data."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "## Synchronization: Host Accessor\n",
+ "#### SYCL Code Anatomy - Buffer Model\n",
+ "Programs which utilize oneAPI require the include of __sycl/sycl.hpp__. 
It is recommended to employ the namespace statement to save typing repeated references into the sycl namespace.\n",
+ "\n",
+ "```cpp\n",
+ "#include <sycl/sycl.hpp>\n",
+ "using namespace sycl;\n",
+ "```\n",
+ "\n",
+ "__SYCL programs__ are standard C++. The program is invoked on the __host__ computer, and offloads computation to the __accelerator__. A programmer uses SYCL’s __queue, buffer, device, and kernel abstractions__ to direct which parts of the computation and data should be offloaded.\n",
+ "\n",
+ "As a first step in a SYCL program we create a __queue__. We offload computation to a __device__ by submitting tasks to a queue. The programmer can choose CPU, GPU, FPGA, and other devices through the __selector__. This program uses the default q here, which means SYCL runtime selects the most capable device available at runtime by using the default selector. We will talk about the devices, device selectors, and the concepts of buffers, accessors and kernels in the upcoming modules but below is a simple SYCL program for you to get started with the above concepts.\n",
+ "\n",
+ "Device and host can either share physical __memory__ or have distinct memories. When the memories are distinct, offloading computation requires __copying data between host and device__. SYCL does not require the programmer to manage the data copies. By creating __Buffers and Accessors__, SYCL ensures that the data is available to host and device without any programmer effort. SYCL also allows the programmer explicit control over data movement when it is necessary to achieve best performance.\n",
+ "\n",
+ "In a SYCL program, we define a __kernel__, which is applied to every point in an index space. For simple programs like this one, the index space maps directly to the elements of the array. The kernel is encapsulated in a __C++ lambda function__. The lambda function is passed a point in the index space as an array of coordinates. For this simple program, the index space coordinate is the same as the array index. The __parallel_for__ in the below program applies the lambda to the index space. The index space is defined in the first argument of the parallel_for as a 1 dimensional __range from 0 to N-1__.\n",
+ "\n",
+ "\n",
+ "The code below shows Simple Vector addition using SYCL. 
Read through the comments addressed in step 1 through step 6.\n", + "\n", + "```cpp\n", + "void SYCL_code(int* a, int* b, int* c, int N) {\n", + "  //Step 1: create a device queue\n", + " //(developer can specify a device type via device selector or use default selector)\n", + "  queue q;\n", + "  //Step 2: create buffers (represent both host and device memory)\n", + "  buffer buf_a(a, range<1>(N));\n", + "  buffer buf_b(b, range<1>(N));\n", + "  buffer buf_c(c, range<1>(N));\n", + "  //Step 3: submit a command for (asynchronous) execution\n", + "  q.submit([&](handler &h){\n", + "   //Step 4: create buffer accessors to access buffer data on the device\n", + " accessor A(buf_a,h,read_only);\n", + " accessor B(buf_b,h,read_only);\n", + " accessor C(buf_c,h,write_only);\n", + "  \n", + "   //Step 5: send a kernel (lambda) for execution\n", + "   h.parallel_for(N, [=](auto i){\n", + " //Step 6: write a kernel\n", + " //Kernel invocations are executed in parallel\n", + " //Kernel is invoked for each element of the range\n", + " //Kernel invocation has access to the invocation id\n", + "     C[i] = A[i] + B[i];\n", + "    });\n", + "  });\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Synchronization: Host Accessor\n", + "\n", + "The Host Accessor is an accessor which uses host buffer access target. It is created outside of the scope of the command group and the data that this gives access to will be available on the host. These are used to synchronize the data back to the host by constructing the host accessor objects. Buffer destruction is the other way to synchronize the data back to the host.\n", "\n", "Buffer takes ownership of the data stored in vector. Creating host accessor is a __blocking call__ and will only return after all enqueued SYCL kernels that modify the same buffer in any queue completes execution and the data is available to the host via this host accessor.\n", "\n", @@ -500,7 +638,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Build and Run\n", + "#### Build and Run\n", "Select the cell below and click run ▶ to compile and execute the code:" ] }, @@ -517,7 +655,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Synchronization: Buffer Destruction\n", + "#### Synchronization: Buffer Destruction\n", "In the below example Buffer creation happens within a separate function scope. When execution advances beyond this __function scope__, buffer destructor is invoked which relinquishes the ownership of data and copies back the data to the host memory.\n", "\n", "The SYCL code below demonstrates Synchronization with Buffer Destruction: Inspect code, there are no modifications necessary:\n", @@ -569,7 +707,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Build and Run\n", + "#### Build and Run\n", "Select the cell below and click run ▶ to compile and execute the code:" ] }, @@ -699,6 +837,57 @@ "! 
chmod 755 q; chmod 755 run_custom_device.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run_custom_device.sh; else ./run_custom_device.sh; fi"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-GPU Selection\n",
+ "\n",
+ "To submit a job to a single GPU, we use the `sycl::device` class with `sycl::gpu_selector_v` to find a GPU device on the system and then create a `sycl::queue` with this device, as shown below:\n",
+ "\n",
+ "```cpp\n",
+ "auto gpu = sycl::device(sycl::gpu_selector_v);\n",
+ "\n",
+ "sycl::queue q(gpu);\n",
+ "```\n",
+ "\n",
+ "To find multiple GPU devices in the system, the `sycl::platform` class is used to query all devices in a system, `sycl::gpu_selector_v` is used to filter only GPU devices, and the `get_devices()` method returns a vector of the GPU devices found.\n",
+ "\n",
+ "```cpp\n",
+ "auto gpus = sycl::platform(sycl::gpu_selector_v).get_devices();\n",
+ "\n",
+ "sycl::queue q_gpu1(gpus[0]);\n",
+ "sycl::queue q_gpu2(gpus[1]);\n",
+ "```\n",
+ "\n",
+ "Once we have found all the GPU devices, we create a `sycl::queue` for each GPU device and submit jobs to them.\n",
+ "\n",
+ "The code below shows how to find multiple GPU devices on a system and submit different kernels to different GPU devices:\n",
+ "\n",
+ "```cpp\n",
+ " // Get all GPU devices in the platform\n",
+ " auto gpus = sycl::platform(sycl::gpu_selector_v).get_devices();\n",
+ "\n",
+ " // create a vector of queues, one per GPU device\n",
+ " std::vector<sycl::queue> q;\n",
+ " for (auto &gpu : gpus) {\n",
+ " // create queue for each device and add to vector\n",
+ " q.push_back(sycl::queue(gpu));\n",
+ " }\n",
+ "\n",
+ " // Submit kernels to multiple GPUs\n",
+ " if (gpus.size() >= 2){\n",
+ " q[0].parallel_for(N, [=](auto i){\n",
+ " //...\n",
+ " });\n",
+ "\n",
+ " q[1].parallel_for(N, [=](auto i){\n",
+ " //...\n",
+ " });\n",
+ " }\n",
+ "```"
+ ]
+ },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
diff --git a/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/lab/vector_add_usm_buffers.cpp b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/lab/vector_add_usm_buffers.cpp
new file mode 100755
index 0000000000..609bf804d9
--- /dev/null
+++ b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/lab/vector_add_usm_buffers.cpp
@@ -0,0 +1,78 @@
+//==============================================================
+// Copyright © Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+// kernel function to compute vector add using Unified Shared memory model (USM)
+void kernel_usm(int* a, int* b, int* c, int N) {
+ //Step 1: create a device queue
+ queue q;
+ //Step 2: create USM device allocation
+ auto a_device = malloc_device<int>(N, q);
+ auto b_device = malloc_device<int>(N, q);
+ auto c_device = malloc_device<int>(N, q);
+ //Step 3: copy memory from host to device
+ q.memcpy(a_device, a, N*sizeof(int));
+ q.memcpy(b_device, b, N*sizeof(int));
+ q.wait();
+ //Step 4: send a kernel (lambda) for execution
+ q.parallel_for(N, [=](auto i){
+ //Step 5: write a kernel
+ c_device[i] = a_device[i] + b_device[i];
+ }).wait();
+ //Step 6: copy the result back to host
+ q.memcpy(c, c_device, N*sizeof(int)).wait();
+ //Step 7: free device allocation
+ free(a_device, q);
+ free(b_device, q);
+ free(c_device, q);
+}
+
+// kernel function to compute vector add using Buffer memory model
+void kernel_buffers(int* a, int* b, int* c, int N) {
+ //Step 1: 
create a device queue + queue q; + //Step 2: create buffers + buffer buf_a(a, range<1>(N)); + buffer buf_b(b, range<1>(N)); + buffer buf_c(c, range<1>(N)); + //Step 3: submit a command for (asynchronous) execution + q.submit([&](handler &h){ + //Step 4: create buffer accessors to access buffer data on the device + accessor A(buf_a, h, read_only); + accessor B(buf_b, h, read_only); + accessor C(buf_c, h, write_only); + //Step 5: send a kernel (lambda) for execution + h.parallel_for(N, [=](auto i){ + //Step 6: write a kernel + C[i] = A[i] + B[i]; + }); + }); +} + +int main() { + // initialize data arrays on host + constexpr int N = 256; + int a[N], b[N], c[N]; + for (int i=0; i /dev/null 2>&1 +/bin/echo "##" $(whoami) is compiling DPCPP_Essentials Module2 -- SYCL Program Structure sample - 8 of 8 vector_add_usm_buffers.cpp +icpx -fsycl lab/vector_add_usm_buffers.cpp +if [ $? -eq 0 ]; then ./a.out; fi + diff --git a/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/src/vector_add_usm_buffers.cpp b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/src/vector_add_usm_buffers.cpp new file mode 100755 index 0000000000..609bf804d9 --- /dev/null +++ b/DirectProgramming/C++SYCL/Jupyter/oneapi-essentials-training/02_SYCL_Program_Structure/src/vector_add_usm_buffers.cpp @@ -0,0 +1,78 @@ +//============================================================== +// Copyright © Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include + +using namespace sycl; + +// kernel function to compute vector add using Unified Shared memory model (USM) +void kernel_usm(int* a, int* b, int* c, int N) { + //Step 1: create a device queue + queue q; + //Step 2: create USM device allocation + auto a_device = malloc_device(N, q); + auto b_device = malloc_device(N, q); + auto c_device = malloc_device(N, q); + //Step 3: copy memory from host to device + q.memcpy(a_device, a, N*sizeof(int)); + q.memcpy(b_device, b, N*sizeof(int)); + q.wait(); + //Step 4: send a kernel (lambda) for execution + q.parallel_for(N, [=](auto i){ + //Step 5: write a kernel + c_device[i] = a_device[i] + b_device[i]; + }).wait(); + //Step 6: copy the result back to host + q.memcpy(c, c_device, N*sizeof(int)).wait(); + //Step 7: free device allocation + free(a_device, q); + free(b_device, q); + free(c_device, q); +} + +// kernel function to compute vector add using Buffer memory model +void kernel_buffers(int* a, int* b, int* c, int N) { + //Step 1: create a device queue + queue q; + //Step 2: create buffers + buffer buf_a(a, range<1>(N)); + buffer buf_b(b, range<1>(N)); + buffer buf_c(c, range<1>(N)); + //Step 3: submit a command for (asynchronous) execution + q.submit([&](handler &h){ + //Step 4: create buffer accessors to access buffer data on the device + accessor A(buf_a, h, read_only); + accessor B(buf_b, h, read_only); + accessor C(buf_c, h, write_only); + //Step 5: send a kernel (lambda) for execution + h.parallel_for(N, [=](auto i){ + //Step 6: write a kernel + C[i] = A[i] + B[i]; + }); + }); +} + +int main() { + // initialize data arrays on host + constexpr int N = 256; + int a[N], b[N], c[N]; + for (int i=0; i