From 699717ebbbec50603861aed70a69f628a26f3b8a Mon Sep 17 00:00:00 2001
From: "Saiapova, Natalia"
Date: Wed, 9 Nov 2022 19:15:50 +0000
Subject: [PATCH] Tools/ApplicationDebugger: update jacobi sample.

* Restructure the source code:
  - the common parts are moved to jacobi.cpp;
  - the computation parts have two flavors:
    - bugged.cpp contains computation with bugs;
    - fixed.cpp contains fixed computation;
* Simplify reductions;
* Update README.

Signed-off-by: Saiapova, Natalia
---
 .../ApplicationDebugger/jacobi/CMakeLists.txt |   5 +-
 Tools/ApplicationDebugger/jacobi/README.md    | 336 +++++++++++-------
 .../ApplicationDebugger/jacobi/src/bugged.cpp | 151 ++++++++
 .../ApplicationDebugger/jacobi/src/fixed.cpp  | 169 +++++++++
 .../jacobi/src/jacobi-bugged.cpp              | 303 ----------------
 .../jacobi/src/jacobi-fixed.cpp               | 322 -----------------
 .../ApplicationDebugger/jacobi/src/jacobi.cpp | 168 +++++++++
 7 files changed, 690 insertions(+), 764 deletions(-)
 create mode 100644 Tools/ApplicationDebugger/jacobi/src/bugged.cpp
 create mode 100644 Tools/ApplicationDebugger/jacobi/src/fixed.cpp
 delete mode 100644 Tools/ApplicationDebugger/jacobi/src/jacobi-bugged.cpp
 delete mode 100644 Tools/ApplicationDebugger/jacobi/src/jacobi-fixed.cpp
 create mode 100644 Tools/ApplicationDebugger/jacobi/src/jacobi.cpp

diff --git a/Tools/ApplicationDebugger/jacobi/CMakeLists.txt b/Tools/ApplicationDebugger/jacobi/CMakeLists.txt
index ec79828b7f..21ba5403c1 100755
--- a/Tools/ApplicationDebugger/jacobi/CMakeLists.txt
+++ b/Tools/ApplicationDebugger/jacobi/CMakeLists.txt
@@ -31,8 +31,9 @@ endif ()
 set (PROJECT_NAME_BUGGED "${PROJECT_NAME}-bugged")
 set (PROJECT_NAME_FIXED "${PROJECT_NAME}-fixed")
 
-add_executable (${PROJECT_NAME_BUGGED} src/jacobi-bugged.cpp)
-add_executable (${PROJECT_NAME_FIXED} src/jacobi-fixed.cpp)
+add_executable (${PROJECT_NAME_BUGGED} src/jacobi.cpp)
+add_executable (${PROJECT_NAME_FIXED} src/jacobi.cpp)
+target_compile_definitions (${PROJECT_NAME_FIXED} PRIVATE FIXED)
 
 # Add custom target for starting a debug session
 add_custom_target (debug-session
diff --git a/Tools/ApplicationDebugger/jacobi/README.md b/Tools/ApplicationDebugger/jacobi/README.md
index 22fbefe161..468c0b6cb8 100755
--- a/Tools/ApplicationDebugger/jacobi/README.md
+++ b/Tools/ApplicationDebugger/jacobi/README.md
@@ -10,7 +10,9 @@ for the debugger.
 
 This sample contains two versions of the same program: `jacobi-bugged` and
 `jacobi-fixed`. The latter works correctly, but in the former several bugs are
-injected. You can try to find and fix them using the debugger.
+injected. You can try to find and fix them using the debugger. The debug steps
+follow a common strategy: resolve the bugs on the CPU first, then focus on the
+possibly more difficult GPU-oriented bugs.
 
 | Area | Description
 |:--- |:---
@@ -64,112 +66,39 @@ devices. Use the ONEAPI_DEVICE_SELECTOR environment variable to select device.
 The default device is Level Zero GPU device, if available. For more details on
 possible values of this variable see [Environment Variables](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector).
 
-For an overview of the Jacobi method, please refer to the Wikipedia article on the [Jacobi method](https://en.wikipedia.org/wiki/Jacobi_method).
+For an overview of the Jacobi method, please refer to the Wikipedia article
+on the [Jacobi method](https://en.wikipedia.org/wiki/Jacobi_method).
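+
+In short: if A is split into its diagonal part D and the remainder (A - D),
+each iteration refines the previous guess x_k via the update rule that also
+appears in the kernel comments of `src/bugged.cpp` and `src/fixed.cpp`:
+
+```
+x_k1 = D^{-1} (b - (A - D) x_k)
+```
+
+As a plain scalar C++ sketch of the same idea (for illustration only; the
+sample's kernels use a hardcoded stencil matrix instead of a general dense
+matrix `A`):
+
+```
+#include <cstddef>
+
+// One Jacobi sweep: refine x_old into x_new for a dense n x n system
+// stored in row-major order.
+void jacobi_sweep(std::size_t n, const float *A, const float *b,
+                  const float *x_old, float *x_new) {
+  for (std::size_t i = 0; i < n; ++i) {
+    float s = b[i];
+    for (std::size_t j = 0; j < n; ++j)
+      if (j != i)
+        s -= A[i * n + j] * x_old[j];  // subtract off-diagonal terms
+    x_new[i] = s / A[i * n + i];       // divide by the diagonal element
+  }
+}
+```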
-For an overview of the debugger, please refer to [Get Started with Intel® Distribution for GDB* on Linux* OS Host](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-debugging-dpcpp-linux/top.html). - -## Recommended commands - -For checking variables values you can use: `print`, `printf`, `display`, `info locals`. - -To define specific actions when a BP is hit, use `commands`, e.g. - -``` -(gdb) commands 1 -Type commands for breakpoint(s) 1, one per line. -End with a line saying just "end". ->silent ->print var1 ->continue ->end -``` - -sets actions for breakpoint 1. At each hit of the breakpoint, the variable `var1` -will be printed and then the debugger will continue until the next breakpoint hit. -Here we start the command list with `silent` -- it helps to suppress GDB output -about hitting the BP. - -On GPU all threads execute SIMD. To apply the command to every SIMD lane -that hit the BP, add `/a` modifier. In the following we print local variable `gid` -for every SIMD lane that hits the breakpoint at line 130: - -``` -(gdb) break 130 -Breakpoint 1 at 0x4125ac: file jacobi-bugged.cpp, line 130. -(gdb) commands /a -Type commands for breakpoint(s) 1, one per line. -Commands will be applied to all hit SIMD lanes. -End with a line saying just "end". ->print gid ->end -(gdb) running -<... GDB output omitted...> -[Switching to Thread 1.1073741824 lane 0] - -Thread 2.1 hit Breakpoint 1, with SIMD lanes [0-7], prepare_for_next_iteration(...arguments omitted...) at jacobi-bugged.cpp:130 -130 acc_l1_norm_x_k1 += abs(acc_x_k1[gid]); // Bug 2 challenge: breakpoint here. -$1 = cl::sycl::id<1> = {8} -$2 = cl::sycl::id<1> = {9} -$3 = cl::sycl::id<1> = {10} -$4 = cl::sycl::id<1> = {11} -$5 = cl::sycl::id<1> = {12} -$6 = cl::sycl::id<1> = {13} -$7 = cl::sycl::id<1> = {14} -$8 = cl::sycl::id<1> = {15} -``` - -You can apply a command to specified threads and SIMD lanes with `thread apply`. -The following command prints `gid` for each active SIMD lane of the current thread: - -``` -(gdb) thread apply :* -q print gid -$9 = cl::sycl::id<1> = {8} -$10 = cl::sycl::id<1> = {9} -$11 = cl::sycl::id<1> = {10} -$12 = cl::sycl::id<1> = {11} -$13 = cl::sycl::id<1> = {12} -$14 = cl::sycl::id<1> = {13} -$15 = cl::sycl::id<1> = {14} -$16 = cl::sycl::id<1> = {15} -``` - -`-q` flag suppresses the additional output such as for which thread and lane -the command was executed. - -You can set a convenience variable and then modify it: - -``` -(gdb) set $temp=1 -(gdb) print $temp -$1 = 1 -(gdb) print ++$temp -$2 = 2 -``` - -If you want to step through the program, use: `step`, `next`. Ensure that -scheduler locking is set to `step` or `on` (otherwise, you will be switched -to a different thread while stepping): - -``` -(gdb) set scheduler-locking step -``` +For an overview of the debugger, please refer to +[Get Started with Intel® Distribution for GDB* on Linux* OS Host](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-debugging-dpcpp-linux/top.html). ## Key Implementation Details Jacobi method is an iterative solver for a system of linear equations. The system must be strictly diagonally dominant to ensure that the method converges to the solution. In our case, the matrix is hardcoded into the kernel from -`main_computation` function. +`compute_x_k1`. All computations happen inside a while-loop. There are two exit criteria -from the loop. First, if the relative error falls below the desired tolerance -we consider it as a success and the algorithm converged. 
Second, if we exceed
-the maximum number of iterations we consider it as a fail, the algorithm
-did not converge.
+from the loop:
+* *success*: the relative error falls below the desired tolerance;
+  the algorithm converged.
+* *fail*: the maximum number of iterations is exceeded; the algorithm
+  did not converge.
+
+The solution consists of three files.
+
+| File | Description
+|:--- |:---
+| jacobi.cpp | Contains the `main` function, no bugs:
* initialize the problem
* set up the SYCL boilerplate
* call the `iterate` function
* validate the result
+| bugged.cpp | Contains the bugged versions of the `iterate`, `compute_x_k1`, and `prepare_for_next_iteration` functions.
This file is included only into the `jacobi-bugged` program.
+| fixed.cpp | Contains the fixed versions of the `iterate`, `compute_x_k1`, and `prepare_for_next_iteration` functions.
This file is included only into the `jacobi-fixed` program.
 
-Each iteration has two parts: `main_computation` and `prepare_for_next_iteration`.
+The iteration loop is located in the `iterate` function.
+Each iteration has two parts: `compute_x_k1` and `prepare_for_next_iteration`.
+All intended bugs are located in the `bugged.cpp` file.
 
-### Main computation
+### Update x_k1 vector (`compute_x_k1`)
 
 Here we compute the resulting vector of this iteration `x^{k+1}`.
 
@@ -203,7 +132,7 @@ L1 norm of a vector is the sum of absolute values of its elements.
 To compute L1 norms of `||x_k - x_k1||_1` and `||x_k1||_1` we use sum
 reduction algorithm. This happens in `prepare_for_next_iteration`.
 
-The computation of the final relative error happens in `main`:
+The computation of the final relative error happens in `iterate`:
 we divide the absolute error `abs_error` by the L1 norm of x_k1
 (`l1_norm_x_k1`).
 
 ### Prepare for the next iteration
@@ -213,40 +142,31 @@ to compute L1 norms of `x_k - x_k1` and `x_k1` respectively.
 
 Second, we update `x_k` with the new value from `x_k1`.
 
-## Building and Running the `jacobi` Project
-> **Note**: If you have not already done so, set up your CLI
-> environment by sourcing the `setvars` script located in
-> the root of your oneAPI installation.
+## Set Environment Variables
+
+When working with the command-line interface (CLI), you should configure
+the oneAPI toolkits using environment variables. Set up your CLI environment
+by sourcing the `setvars` script every time you open a new terminal window.
+This practice ensures that your compiler, libraries, and tools are ready
+for development.
+
+> **Note**: The `setvars` script is located in the root of your
+> oneAPI installation.
 >
 > Linux*:
 > - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
 > - For private installations: `. ~/intel/oneapi/setvars.sh`
 > - For non-POSIX shells, like csh, use the following command: `$ bash -c 'source /setvars.sh ; exec csh'`
 >
-> For more information on configuring environment variables, see [Use the setvars Script with Linux* or MacOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html)
+> For more information on configuring environment variables, see [Use the setvars Script with Linux* or MacOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html).
 
-### Setup
+## Build the `jacobi-bugged` and `jacobi-fixed` programs
 
 Preliminary setup steps are needed for the debugger to function.
 Please see the setup instructions in the Get Started Guide
 [Get Started with Intel® Distribution for GDB* on Linux* OS Host](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-debugging-dpcpp-linux/)
 
-### Running Samples In Intel® DevCloud
-
-If running a sample in the Intel® DevCloud, remember that you must specify the
-compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode.
-We recommend running the `jacobi` sample on a node with GPU. In order to have an
-interactive debugging session, we recommend using the interactive mode. To get
-the setting, after connecting to the login node, type the following command:
-
-```
-$ qsub -I -l nodes=1:gpu:ppn=2
-```
-Within the interactive session on the GPU node, build and run the sample.
-For more information, see the Intel® oneAPI Base Toolkit Get Started Guide
-(https://devcloud.intel.com/oneapi/get-started/base-toolkit/).
-
-### Using Visual Studio Code* (Optional)
+### Using Visual Studio Code* (VS Code) (Optional)
 
 You can use Visual Studio Code (VS Code) extensions to set your environment,
 create launch configurations, and browse and download samples.
@@ -260,11 +180,26 @@ The basic steps to build and run a sample using VS Code include:
 To learn more about the extensions and how to configure the oneAPI environment, see the
 [Using Visual Studio Code with Intel® oneAPI Toolkits User Guide](https://software.intel.com/content/www/us/en/develop/documentation/using-vs-code-with-intel-oneapi/top.html).
 
-### On a Linux* System
+### Using Intel® DevCloud
+
+If running a sample in the Intel® DevCloud, remember that you must specify the
+compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode.
+
+We recommend running the `jacobi` sample on a node with GPU. In order to have an
+interactive debugging session, we recommend using the interactive mode. To get
+the setting, after connecting to the login node, type the following command:
+
+```
+$ qsub -I -l nodes=1:gpu:ppn=2
+```
+
+Within the interactive session on the GPU node, build and run the sample.
+For more information, see the Intel® oneAPI Base Toolkit Get Started Guide
+(https://devcloud.intel.com/oneapi/get-started/base-toolkit/).
 
-Perform the following steps:
+### Build on a Linux* System
 
-1. Build the project using the following `cmake` commands.
+Build the project using the following `cmake` commands.
 
 ```
 $ cd jacobi
@@ -274,10 +209,16 @@ Perform the following steps:
 $ make
 ```
 
- This builds both `jacobi-bugged` and `jacoby-fixed` versions of the program.
+ This builds both `jacobi-bugged` and `jacobi-fixed` versions of the program.
 > Note: The cmake configuration enforces the `Debug` build type.
 
-2. Run the buggy program:
+### Set Up the Debugger
+
+Preliminary setup steps are needed for the debugger to function.
+Please see the setup instructions in the Get Started Guide
+[Get Started with Intel® Distribution for GDB* on Linux* OS Host](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-debugging-dpcpp-linux/)
+
+### Run the buggy program
 
 ```
 $ ./jacobi-bugged
@@ -295,21 +236,138 @@ Perform the following steps:
 $ ONEAPI_DEVICE_SELECTOR=*:cpu gdb-oneapi jacobi-bugged
 ```
 
-4. Clean the program using:
+4. Clean the program (optional):
 
 ```
 $ make clean
 ```
 
-For instructions about starting and using the debugger, please see
-[Get Started with Intel® Distribution for GDB* on Linux* OS Host](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-debugging-dpcpp-linux/).
+For instructions about starting and using the debugger on Linux* OS Host,
+please see
+- [Tutorial: Debug a SYCL* Application on a CPU](https://www.intel.com/content/www/us/en/develop/documentation/debugging-dpcpp-linux/top/debug-a-sycl-application-on-a-cpu.html)
+- [Tutorial: Debug a SYCL* Application on a GPU](https://www.intel.com/content/www/us/en/develop/documentation/debugging-dpcpp-linux/top/debug-a-sycl-application-on-a-gpu.html)
+
+## Guided Debugging
+
+The following sections provide step-by-step instructions for locating and
+resolving the three bugs in the `jacobi` sample, and demonstrate basic usage
+of the debugger.
+
+### Recommended Commands
+
+To check variable values, you can use `print`, `printf`, `display`, and
+`info locals`.
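+
+For example, when a breakpoint inside the `iterate` function is hit, a session
+might look like this (a hypothetical transcript; `rel_error` and `k` are local
+variables of this sample, and the printed values are only illustrative):
+
+```
+(gdb) print rel_error
+$1 = 2.7581
+(gdb) printf "iteration %d\n", k
+iteration 0
+(gdb) info locals
+<... values of k, rel_error, and the other locals ...>
+```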
+
+#### `commands` command
+
+To define specific actions when a BP is hit, use `commands <breakpoint number>`,
+e.g.
+
+```
+(gdb) commands 1
+Type commands for breakpoint(s) 1, one per line.
+End with a line saying just "end".
+>silent
+>print var1
+>continue
+>end
+```
+
+The above sets actions for breakpoint 1. At each hit of the breakpoint,
+the variable `var1` will be printed and then the debugger will continue until
+the next breakpoint hit. Here we start the command list with `silent` -- it
+helps to suppress GDB output about hitting the BP.
+
+On the GPU, all threads execute in SIMD. To apply the command to every SIMD
+lane that hits the BP, add the `/a` modifier. In the following we print the
+local variable `index` for every SIMD lane that hits the breakpoint in
+`compute_x_k1_kernel`:
+
+```
+(gdb) break compute_x_k1_kernel
+Breakpoint 1 at 0x4048e8: file bugged.cpp, line 18.
+(gdb) commands /a
+Type commands for breakpoint(s) 1, one per line.
+Commands will be applied to all hit SIMD lanes.
+End with a line saying just "end".
+>print index
+>end
+(gdb) run
+<... GDB output omitted...>
+
+[Switching to Thread 1.153 lane 0]
+
+Thread 2.153 hit Breakpoint 1, with SIMD lanes [0-7], compute_x_k1_kernel (index=sycl::_V1::id<1> = {...}, b=0xffffb557aa530000, x_k=0xffffb557aa530200, x_k1=0xffffb557aa530400) at bugged.cpp:18
+18        int i = index[0];
+$1 = sycl::_V1::id<1> = {56}
+$2 = sycl::_V1::id<1> = {57}
+$3 = sycl::_V1::id<1> = {58}
+$4 = sycl::_V1::id<1> = {59}
+$5 = sycl::_V1::id<1> = {60}
+$6 = sycl::_V1::id<1> = {61}
+$7 = sycl::_V1::id<1> = {62}
+$8 = sycl::_V1::id<1> = {63}
+```
+
+#### `thread apply`
+
+You can apply a command to specified threads and SIMD lanes with `thread apply`.
+The following command prints `index` for each active SIMD lane of the current
+thread:
+
+```
+(gdb) thread apply :* -q print index
+$9 = sycl::_V1::id<1> = {56}
+$10 = sycl::_V1::id<1> = {57}
+$11 = sycl::_V1::id<1> = {58}
+$12 = sycl::_V1::id<1> = {59}
+$13 = sycl::_V1::id<1> = {60}
+$14 = sycl::_V1::id<1> = {61}
+$15 = sycl::_V1::id<1> = {62}
+$16 = sycl::_V1::id<1> = {63}
+```
+
+The `-q` flag suppresses the additional output such as for which thread and lane
+the command was executed.
+
+#### User-defined Convenience Variables
+
+You can define a convenience variable and use it for computations within
+the debugger.
+
+In the following we define a variable `$temp` and set its initial value to `1`.
+Then we increment its value.
+```
+(gdb) set $temp=1
+(gdb) print $temp
+$1 = 1
+(gdb) print ++$temp
+$2 = 2
+```
+
+#### Stepping through the program
+
+If you want to step through the program, use `step` or `next`. Ensure that
+scheduler locking is set to `step` or `on` (otherwise, you will be switched
+to a different thread while stepping):
+
+```
+(gdb) set scheduler-locking step
+```
+
+### Debugging `jacobi-bugged`
+
+To isolate the bugs, we focus on the CPU first -- a common strategy. You can
+specify the device for offloading the kernels using the
+`ONEAPI_DEVICE_SELECTOR` variable, for example:
+
+```
+$ ONEAPI_DEVICE_SELECTOR=*:cpu ./myApplication
+```
+
+The above limits the kernel offloading only to CPU devices.
+
+#### 1. Run the application on CPU
 
 When run, the original `jacobi-bugged` program shows the first bug:
 
 ```
 $ ./jacobi-bugged
@@ -326,6 +384,8 @@ fail; Bug 1. Fix this on CPU: components of x_k are not close to 1.0.
 Hint: figure out which elements are farthest from 1.0.
 ```
 
+#### 2.
Locate the second CPU bug + Once the first bug is fixed, the second bug becomes visible: ``` @@ -352,7 +412,8 @@ Challenge: in the debugger you can simmulate the computation of a reduced ``` Once this bug is fixed, while offloading to CPU you receive the correct result: -which are the same as in `jacobi-fixed` for the offload to CPU device: +which are the same as in `jacobi-fixed` for the offload to CPU device. Since we're +only running on the CPU, we don't see the GPU bug: ``` $ ONEAPI_DEVICE_SELECTOR=*:cpu ./jacobi-fixed @@ -365,7 +426,7 @@ success; all elements of the resulting vector are close to 1.0. success; the relative error (9.97509e-05) is below the desired tolerance 0.0001 after 51 iterations. ``` -#### GPU +#### 3. Run debug on the GPU Start debugging GPU only after first two bugs are fixed on CPU. To debug on Intel GPU device, the device must be Level Zero. The Level @@ -377,7 +438,7 @@ Bug 3 is immediately hit while offloading to GPU: ``` $ ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./jacobi-bugged -[SYCL] Using device: [Intel(R) Iris(R) Plus Graphics 650 [0x5927]] from [Intel(R) Level-Zero] +[SYCL] Using device: [Intel(R) Graphics [0x56c1]] from [Intel(R) Level-Zero] fail; Bug 3. Fix it on GPU. The relative error has invalid value after iteration 0. Hint 1: inspect reduced error values. With the challenge scenario @@ -385,7 +446,9 @@ Hint 1: inspect reduced error values. With the challenge scenario the correct values inside kernel on GPU. Take into account SIMD lanes: on GPU each thread processes several work items at once, so you need to modify your commands and update - the convenience variable for each SIMD lane. + the convenience variable for each SIMD lane, e.g. using + `thread apply :*`. + Hint 2: why don't we get the correct values at the host part of the application? ``` @@ -395,7 +458,7 @@ as for `jacobi-fixed` with the offload to GPU device: ``` $ ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./jacobi-fixed -[SYCL] Using device: [Intel(R) Iris(R) Plus Graphics 650 [0x5927]] from [Intel(R) Level-Zero] +[SYCL] Using device: [Intel(R) Graphics [0x56c1]] from [Intel(R) Level-Zero] Iteration 0, relative error = 2.7581 Iteration 20, relative error = 0.119557 Iteration 40, relative error = 0.0010374 @@ -404,7 +467,7 @@ success; all elements of the resulting vector are close to 1.0. success; the relative error (9.97509e-05) is below the desired tolerance 0.0001 after 51 iterations. ``` -#### FPGA Emulation: +#### Debugging on FPGA Emulation: While offloading to FPGA emulation device, only first two bugs appear (similar to CPU): @@ -544,7 +607,6 @@ success; the relative error (9.97509e-05) is below the desired tolerance 0.0001 optional `-force` flag to force the condition to be defined even when `exp` is invalid for the current locations of the breakpoint. Useful for defining conditions involving JIT-produced artificial variables. - E.g.: `cond -force 1 __ocl_dbg_gid0 == 19`. --- diff --git a/Tools/ApplicationDebugger/jacobi/src/bugged.cpp b/Tools/ApplicationDebugger/jacobi/src/bugged.cpp new file mode 100644 index 0000000000..8d89bcae3e --- /dev/null +++ b/Tools/ApplicationDebugger/jacobi/src/bugged.cpp @@ -0,0 +1,151 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// This file is included only into jacobi-bugged program. +// This file contains methods where actual computation takes place. 
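+// It is #include'd into jacobi.cpp when the FIXED macro is not defined
+// (see target_compile_definitions in CMakeLists.txt).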
+// Here we injected three bugs, that can be investigated with the help +// of the debugger. + + +// Compute x_k1 and write the result to its accessor. + +void compute_x_k1_kernel (id<1> &index, float *b, + float *x_k, float *x_k1) { + // Current index. + int i = index[0]; + + // The vector x_k1 should be computed as: + // x_k1 = D^{-1}(b - (A - D)x_k), + // where A is our matrix, D is its diagonal, b is right hand + // side vector, and x_k is the result of the previous iteration. + // + // Matrices (A - D) and D are hardcoded as: + // (A - D) is a stencil matrix [1 1 0 1 1]; + // D is a diagonal matrix with all elements equal to 5. + + float result = b[i]; + + // Non-diagonal elements of matrix A are all 1s, so to substract + // i-th element of (A - D)x_k, we need to substract the sum of elements + // of x_k with indices i - 2, i - 1, i + 1, i + 2. We do not substract + // the i-th element, as it gets multiplied by 0 in (A - D)x_k. + result -= x_k[i - 2]; + result -= x_k[i - 1]; + result -= x_k[i + 1]; + result -= x_k[i + 2]; + + // In our case the diagonal matrix has only 5s on the diagonal, so + // division by 5 gives us its invert. + result /= 5; + + // Save the value to the output buffer. + x_k1[i] = result; +} + +// Submits the kernel which updates x_k1 at every iteration. + +void compute_x_k1 (queue &q, buffer_args &buffers) { + q.submit([&](auto &h) { + accessor acc_b(buffers.b, h, read_only); + accessor acc_x_k(buffers.x_k, h, read_only); + accessor acc_x_k1(buffers.x_k1, h, write_only); + + h.parallel_for(range{n}, [=](id<1> index) { + compute_x_k1_kernel (index, acc_b.get_pointer(), acc_x_k.get_pointer(), + acc_x_k1.get_pointer()); + }); + }); +} + +// Here we compute values which are used for relative error computation +// and copy the vector x_k1 over the vector x_k. + +void prepare_for_next_iteration (queue &q, buffer_args &buffers) { + constexpr size_t l = 16; + + q.submit([&](auto &h) { + accessor acc_abs_error(buffers.abs_error, h, read_write); + accessor acc_x_k(buffers.x_k, h, read_write); + accessor acc_x_k1(buffers.x_k1, h, read_only); + + // To compute the relative error we need to prepare two values: + // absolute error and L1-norm of x_k1. + // Absolute error is computed as L1-norm of (x_k - x_k1). + // To compute the L1-norms of x_k1 and (x_k - x_k1) vectors + // we use the reduction API with std::plus operator. + auto r_abs_error = reduction(buffers.abs_error, h, std::plus<>()); + auto r_l1_norm_x_k1 = reduction(buffers.l1_norm_x_k1, h, std::plus<>()); + + h.parallel_for(nd_range<1>{n, l}, + r_abs_error, r_l1_norm_x_k1, + [=](nd_item<1> index, auto &abs_error, auto &l1_norm_x_k1) { + auto gid = index.get_global_id(); + float x_k = acc_x_k[gid]; + float x_k1 = acc_x_k1[gid]; + + // Execute reduction sums. + float local_abs_error = abs(x_k - x_k1); + abs_error += local_abs_error; // Bug 2 challenge: breakpoint here. + l1_norm_x_k1 += abs(x_k1); + + // Copy the vector x_k1 over x_k. + acc_x_k[gid] = x_k1; + }); + }); +} + +// Iterate until the algorithm converges (success) or the maximum number +// of iterations is reached (fail). + +int iterate(queue &q, float *b, float *x_k, float *x_k1, float &rel_error) { + // Absolute error, ||x_k - x_k1||_1, L1-norm of (x_k - x_k1). + float abs_error = 0; + // ||x_k1||_1, L1-norm of x_k1. + float l1_norm_x_k1 = 0; + + int k = 0; + + // Jacobi iteration begins. + do {// k-th iteration of Jacobi. + // Create SYCL buffers. 
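+    // The buffers wrap the host arrays b, x_k, x_k1 and the two reduction
+    // scalars; see the buffer_args definition in jacobi.cpp.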
+ buffer_args buffers {b, x_k, x_k1, &l1_norm_x_k1, &abs_error}; + + compute_x_k1(q, buffers); + prepare_for_next_iteration(q, buffers); + q.wait_and_throw(); + + // Compute relative error based on reduced values from this iteration. + rel_error = abs_error / (l1_norm_x_k1 + 1e-32); + + if (abs_error < 0 || l1_norm_x_k1 < 0 + || (abs_error + l1_norm_x_k1) < 1e-32) { + cout << "\nfail; Bug 3. Fix it on GPU. The relative error has invalid value " + << "after iteration " << k << ".\n" + << "Hint 1: inspect reduced error values. With the challenge scenario\n" + << " from bug 2 you can verify that reduction algorithms compute\n" + << " the correct values inside kernel on GPU. Take into account\n" + << " SIMD lanes: on GPU each thread processes several work items\n" + << " at once, so you need to modify your commands and update\n" + << " the convenience variable for each SIMD lane, e.g. using \n" + << " `thread apply :*`.\n" + << "Hint 2: why don't we get the correct values at the host part of\n" + << " the application?\n"; + exit(0); + } + + // Periodically print out how the algorithm behaves. + if (k % 20 == 0) { + std::cout << "Iteration " << k << ", relative error = " + << rel_error << "\n"; + } + + k++; + } while (rel_error > tolerance && k < max_number_of_iterations); + // Jacobi iteration ends. + + return k; +} + diff --git a/Tools/ApplicationDebugger/jacobi/src/fixed.cpp b/Tools/ApplicationDebugger/jacobi/src/fixed.cpp new file mode 100644 index 0000000000..c6a327c863 --- /dev/null +++ b/Tools/ApplicationDebugger/jacobi/src/fixed.cpp @@ -0,0 +1,169 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// This file is included only into jacobi-fixed program. +// This file contains methods where actual computation takes place. +// Here we explain and show how to fix Bugs 1, 2, and 3. + + +// Compute x_k1 and write the result to its accessor. + +void compute_x_k1_kernel (id<1> &index, float *b, + float *x_k, float *x_k1) { + // Current index. + int i = index[0]; + + // The vector x_k1 should be computed as: + // x_k1 = D^{-1}(b - (A - D)x_k), + // where A is our matrix, D is its diagonal, b is right hand + // side vector, and x_k is the result of the previous iteration. + // + // Matrices (A - D) and D are hardcoded as: + // (A - D) is a stencil matrix [1 1 0 1 1]; + // D is a diagonal matrix with all elements equal to 5. + + float result = b[i]; + + // Non-diagonal elements of matrix A are all 1s, so to substract + // i-th element of (A - D)x_k, we need to substract the sum of elements + // of x_k with indices i - 2, i - 1, i + 1, i + 2. We do not substract + // the i-th element, as it gets multiplied by 0 in (A - D)x_k. + + // Fix Bug 1: out-of-bounds access: all indices below can trigger + // out-of-bounds access and thus garbage values will be read. + // Fix it by adding checks that the index exists: + if (i > 1) + result -= x_k[i - 2]; + if (i > 0) + result -= x_k[i - 1]; + if (i < n - 1) + result -= x_k[i + 1]; + if (i < n - 2) + result -= x_k[i + 2]; + + // In our case the diagonal matrix has only 5s on the diagonal, so + // division by 5 gives us its invert. + result /= 5; + + // Save the value to the output buffer. + x_k1[i] = result; +} + +// Submits the kernel which updates x_k1 at every iteration. 
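+// One work item is launched per element of x_k1.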
+ +void compute_x_k1 (queue &q, buffer_args &buffers) { + q.submit([&](auto &h) { + accessor acc_b(buffers.b, h, read_only); + accessor acc_x_k(buffers.x_k, h, read_only); + accessor acc_x_k1(buffers.x_k1, h, write_only); + + h.parallel_for(range{n}, [=](id<1> index) { + compute_x_k1_kernel (index, acc_b.get_pointer(), acc_x_k.get_pointer(), + acc_x_k1.get_pointer()); + }); + }); +} + +// Here we compute values which are used for relative error computation +// and copy the vector x_k1 over the vector x_k. + +void prepare_for_next_iteration (queue &q, buffer_args &buffers) { + constexpr size_t l = 16; + + q.submit([&](auto &h) { + accessor acc_abs_error(buffers.abs_error, h, read_write); + accessor acc_x_k(buffers.x_k, h, read_write); + accessor acc_x_k1(buffers.x_k1, h, read_only); + + // To compute the relative error we need to prepare two values: + // absolute error and L1-norm of x_k1. + // Absolute error is computed as L1-norm of (x_k - x_k1). + // To compute the L1-norms of x_k1 and (x_k - x_k1) vectors + // we use the reduction API with std::plus operator. + auto r_abs_error = reduction(buffers.abs_error, h, std::plus<>()); + auto r_l1_norm_x_k1 = reduction(buffers.l1_norm_x_k1, h, std::plus<>()); + + h.parallel_for(nd_range<1>{n, l}, + r_abs_error, r_l1_norm_x_k1, + [=](nd_item<1> index, auto &abs_error, auto &l1_norm_x_k1) { + auto gid = index.get_global_id(); + float x_k = acc_x_k[gid]; + float x_k1 = acc_x_k1[gid]; + + // Execute reduction sums. + float local_abs_error = abs(x_k - x_k1); + abs_error += local_abs_error; // Bug 2 challenge: breakpoint here. + l1_norm_x_k1 += abs(x_k1); + + // Copy the vector x_k1 over x_k. + acc_x_k[gid] = x_k1; + }); + }); +} + +// Iterate until the algorithm converges (success) or the maximum number +// of iterations is reached (fail). + +int iterate(queue &q, float *b, float *x_k, float *x_k1, float &rel_error) { + // Absolute error, ||x_k - x_k1||_1, L1-norm of (x_k - x_k1). + float abs_error = 0; + // ||x_k1||_1, L1-norm of x_k1. + float l1_norm_x_k1 = 0; + + int k = 0; + + // Jacobi iteration begins. + do {// k-th iteration of Jacobi. + // Fix bug 2: we have to reset the error values at each iteration, otherwise + // the relative error accumulates through iterations and does not fall + // below the tolerance. + abs_error = 0; + l1_norm_x_k1 = 0; + + { // Fix bug 3: the host values of abs_error and l1_norm_x_k1 + // were not synchronised with their new values on device. + // Open new scope for buffers. Once the scope is ended, the destructors + // of buffers will write the data back from device to host. + + // Create SYCL buffers. + buffer_args buffers {b, x_k, x_k1, &l1_norm_x_k1, &abs_error}; + + compute_x_k1(q, buffers); + prepare_for_next_iteration(q, buffers); + } + + // Compute relative error based on reduced values from this iteration. + rel_error = abs_error / (l1_norm_x_k1 + 1e-32); + + if (abs_error < 0 || l1_norm_x_k1 < 0 + || (abs_error + l1_norm_x_k1) < 1e-32) { + cout << "\nfail; Bug 3. Fix it on GPU. The relative error has invalid value " + << "after iteration " << k << ".\n" + << "Hint 1: inspect reduced error values. With the challenge scenario\n" + << " from bug 2 you can verify that reduction algorithms compute\n" + << " the correct values inside kernel on GPU. Take into account\n" + << " SIMD lanes: on GPU each thread processes several work items\n" + << " at once, so you need to modify your commands and update\n" + << " the convenience variable for each SIMD lane, e.g. 
using \n" + << " `thread apply :*`.\n" + << "Hint 2: why don't we get the correct values at the host part of\n" + << " the application?\n"; + exit(0); + } + + // Periodically print out how the algorithm behaves. + if (k % 20 == 0) { + std::cout << "Iteration " << k << ", relative error = " + << rel_error << "\n"; + } + + k++; + } while (rel_error > tolerance && k < max_number_of_iterations); + // Jacobi iteration ends. + + return k; +} + diff --git a/Tools/ApplicationDebugger/jacobi/src/jacobi-bugged.cpp b/Tools/ApplicationDebugger/jacobi/src/jacobi-bugged.cpp deleted file mode 100644 index 1ef8ca9741..0000000000 --- a/Tools/ApplicationDebugger/jacobi/src/jacobi-bugged.cpp +++ /dev/null @@ -1,303 +0,0 @@ -//============================================================== -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: MIT -// ============================================================= - -// The program solves the linear equation Ax=b, where matrix A is a -// n x n sparse matrix with diagonals [1 1 5 1 1], -// vector b is set such that the solution is [1 1 ... 1]^T. -// The linear system is solved via Jacobi iteration. -// The algorithm converges, as the matrix A is strictly diagonally dominant. - -#include -#include -#include -// Location of file: /dev-utilities//include -#include "dpc_common.hpp" - -using namespace std; -using namespace sycl; - -// Helper structure to initialize and hold all our SYCL buffers. -// Note: no bugs here. -struct buffer_args { - buffer b; - buffer x_k; - buffer x_k1; - buffer l1_norm_x_k1; - buffer abs_error; - buffer_args(size_t n, float *b, - float *x_k, float *x_k1, - float *l1_norm_x_k1, - float *abs_error): - // Right hand side vector b; - b(buffer(b, range{n})), - // Iteration vectors x_k and x_k1; - x_k(buffer(x_k, range{n})), - x_k1(buffer(x_k1, range{n})), - // Sum of absolute values of x_k1 elements. - l1_norm_x_k1(buffer(l1_norm_x_k1, range{1})), - // Absolute error. - abs_error(buffer(abs_error, range{1})) - {} -}; - -// Initialize right hand side vector b and the initial guess for x_k. -void initialize_input(float *b, float *x_k, size_t n); - -// At each iteration the computation of the resulting vector x happens here. -void main_computation (queue &q, buffer_args &buffers, size_t n) { - // Here, we compute the updated vector x_k1. - q.submit([&](auto &h) { - accessor acc_b(buffers.b, h, read_only); - accessor acc_x_k(buffers.x_k, h, read_only); - accessor acc_x_k1(buffers.x_k1, h, write_only); - - h.parallel_for(range{n}, [=](id<1> index) { - // Current index. - int i = index[0]; - - // The vector x_k1 should be computed as: - // x_k1 = D^{-1}(b - (A - D)x_k), - // where A is our matrix, D is its diagonal, b is right hand - // side vector, and x_k is the result of the previous iteration. - // - // Matrices (A - D) and D are hardcoded as: - // (A - D) is a stencil matrix [1 1 0 1 1]; - // D is a diagonal matrix with all elements equal to 5. - - float x_k1 = acc_b[i]; - - // Non-diagonal elements of matrix A are all 1s, so to substract - // i-th element of (A - D)x_k, we need to substract the sum of elements - // of x_k with indices i - 2, i - 1, i + 1, i + 2. We do not substract - // the i-th element, as it gets multiplied by 0 in (A - D)x_k. - x_k1 -= acc_x_k[i - 2]; - x_k1 -= acc_x_k[i - 1]; - x_k1 -= acc_x_k[i + 1]; - x_k1 -= acc_x_k[i + 2]; - - // In our case the diagonal matrix has only 5s on the diagonal, so - // division by 5 gives us its invert. - x_k1 /= 5; - - // Save the value to the output buffer. 
- acc_x_k1[index] = x_k1; - }); - }); -} - -// Here we compute values which are used for relative error computation -// and copy the vector x_k1 over the vector x_k. -void prepare_for_next_iteration (queue &q, buffer_args &buffers,size_t n) { - constexpr size_t l = 16; - - // To compute the relative error we need to prepare two values: - // abs_error and l1_norm_x_k1. - // - // First, we need to compute the L1-norm of (x_k - x_k1). To do that, - // we sum absolute values of its elements into abs_error buffer. - // We use a reduction algorithm here. - q.submit([&](auto &h) { - accessor acc_abs_error(buffers.abs_error, h, read_write); - accessor acc_x_k(buffers.x_k, h, read_only); - accessor acc_x_k1(buffers.x_k1, h, read_only); - auto r_abs_error = reduction(buffers.abs_error, h, std::plus<>()); - - h.parallel_for(nd_range<1>{n, l}, - r_abs_error, - [=](nd_item<1> index, auto& acc_abs_error) { - auto gid = index.get_global_id(); - - // Compute the sum. - acc_abs_error += abs(acc_x_k[gid] - acc_x_k1[gid]); - }); - }); - - // Second, we need to compute the L1 norm of x_k1. - // For that, we sum absolute values of all elements of vector x_k1 into - // l1_norm_x_k1 buffer. Again we use the reduction algorithm here. - q.submit([&](auto &h) { - accessor acc_x_k1(buffers.x_k1, h, read_only); - accessor acc_l1_norm_x_k1(buffers.l1_norm_x_k1, h, read_write); - auto r_l1_norm_x_k1 = reduction(buffers.l1_norm_x_k1, h, std::plus<>()); - - h.parallel_for(nd_range<1>{n, l}, - r_l1_norm_x_k1, - [=](nd_item<1> index, auto& acc_l1_norm_x_k1) { - auto gid = index.get_global_id(); - // Compute the sum. - acc_l1_norm_x_k1 += abs(acc_x_k1[gid]); // Bug 2 challenge: breakpoint here. - }); - }); - - // We copy the values of vector x_k1 to x_k in order to setup - // the next iteration. - q.submit([&](auto &h) { - accessor acc_x_k(buffers.x_k, h, write_only); - accessor acc_x_k1(buffers.x_k1, h, read_only); - - h.parallel_for(range{n}, [=](id<1> index) { - auto gid = index[0]; - - // Copy the vector x_k1 over x_k. - acc_x_k[gid] = acc_x_k1[gid]; - }); - }); -} - -int main(int argc, char *argv[]) { - // The size of the problem. - // The matrix A is n x n matrix, the length of the vector x is n. - constexpr size_t n = 64; - - // The maximum number of iterations the algorithm is going - // to take. - constexpr size_t max_number_of_iterations = 100; - - // We expect each element of vector x to be that close - // to the analitycal solution. - constexpr float tolerance = 1e-4; - - // The right hand side vector. - float b[n]; - - // We store an intermediate result after every iteration here. - float x_k[n]; - - // At each iteration we compute a new value of x here. We need - // both buffers x_k and x_k1 due to the data dependency: to compute - // one element at the iteration k + 1 we need several elements - // from the iteration k. - float x_k1[n]; - - // We will compute the relative error at each iteration as: - // - // ||x_k - x_k1||_1 abs_error - // rel_error = ------------------- = -------------- - // ||x_k1||_1 l1_norm_x_k1 - - // Absolute error, ||x_k - x_k1||_1, L1-norm of (x_k - x_k1). - float abs_error = 0; - // ||x_k1||_1, L1-norm of x_k1. - float l1_norm_x_k1 = 0; - // Relative error. - float rel_error; - - // Initialize the input. - // Note: the matrix A is hardcoded as a stencil matrix [1 1 5 1 1] - // into the kernel. - initialize_input(b, x_k, n); - - // Iteration counter. 
- int k = 0; - try { - queue q(default_selector_v, dpc_common::exception_handler); - cout << "[SYCL] Using device: [" - << q.get_device().get_info() << "] from [" - << q.get_device().get_platform().get_info() - << "]\n"; - - // Jacobi iteration begins. - do {// k-th iteration of Jacobi. - // Create SYCL buffers. - buffer_args buffers {n, b, x_k, x_k1, &l1_norm_x_k1, &abs_error}; - - main_computation(q, buffers, n); - prepare_for_next_iteration(q, buffers, n); - q.wait_and_throw(); - - // Compute relative error based on reduced values from this iteration. - rel_error = abs_error / (l1_norm_x_k1 + 1e-32); - - if (abs_error < 0 || l1_norm_x_k1 < 0 - || (abs_error + l1_norm_x_k1) < 1e-32) { - cout << "\nfail; Bug 3. Fix it on GPU. The relative error has invalid value " - << "after iteration " << k << ".\n" - << "Hint 1: inspect reduced error values. With the challenge scenario\n" - << " from bug 2 you can verify that reduction algorithms compute\n" - << " the correct values inside kernel on GPU. Take into account\n" - << " SIMD lanes: on GPU each thread processes several work items\n" - << " at once, so you need to modify your commands and update\n" - << " the convenience variable for each SIMD lane.\n" - << "Hint 2: why don't we get the correct values at the host part of\n" - << " the application?\n"; - return 0; - } - - // Periodically print out how the algorithm behaves. - if (k % 20 == 0) { - std::cout << "Iteration " << k << ", relative error = " - << rel_error << "\n"; - } - - k++; - } while (rel_error > tolerance && k < max_number_of_iterations); - // Jacobi iteration ends. - - } catch (sycl::exception const &e) { - cout << "fail; synchronous exception occurred: " << e.what() << "\n"; - return -1; - } - - // Verify the output, we expect a vector whose components are close to 1.0. - bool correct = true; - for (int i = 0; i < n; i++) { - if ((x_k[i] - 1.0f) * (x_k[i] - 1.0f) > tolerance) - correct = false; - } - - if (correct) - cout << "\nsuccess; all elements of the resulting vector are close to 1.0.\n"; - else { - cout << "\nfail; Bug 1. Fix this on CPU: components of x_k are not close to 1.0.\n" - << "Hint: figure out which elements are farthest from 1.0.\n"; - return 0; - } - - // Check whether the algorithm converged. - if (k < max_number_of_iterations) { - cout << "success; the relative error (" << rel_error - << ") is below the desired tolerance " - << tolerance <<" after " << k <<" iterations.\n\n"; - } else { - cout << "\nfail; Bug 2. Fix this on CPU: the relative error (" << rel_error - << ") is greater than\n" - << " the desired tolerance " - << tolerance <<" after " << max_number_of_iterations - << " iterations.\n" - << "Hint: check the reduction results at several iterations.\n" - << "Challenge: in the debugger you can simmulate the computation of a reduced\n" - << " value by putting a BP inside the corresponding kernel and defining\n" - << " a convenience variable. We will compute the reduced value at this\n" - << " convenience variable: at each BP hit we update it with a help of \"commands\"\n" - << " command. After the reduction kernel is finished, the convenience\n" - << " variable should contain the reduced value.\n" - << " See README for details.\n"; - return 0; - } - - return 0; -} - -// Note: no bugs here. -void initialize_input(float *b, float *x_k, size_t n) { - constexpr float main_b = 9; - - // Vector b and the matrix A are hardcoded - // such that the analytical solution of the system Ax=b is a vector - // whose elements are 1s. 
- for (int i = 0; i < n; i++) - b[i] = main_b; - - // Boundary values of the vector b. - b[0] = main_b - 2; - b[1] = main_b - 1; - b[n - 2] = main_b - 1; - b[n - 1] = main_b -2; - - // Initial guess of x is b. - for (int i = 0; i < n; i++) - x_k[i] = b[i]; -} diff --git a/Tools/ApplicationDebugger/jacobi/src/jacobi-fixed.cpp b/Tools/ApplicationDebugger/jacobi/src/jacobi-fixed.cpp deleted file mode 100644 index 255b5df2a0..0000000000 --- a/Tools/ApplicationDebugger/jacobi/src/jacobi-fixed.cpp +++ /dev/null @@ -1,322 +0,0 @@ -//============================================================== -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: MIT -// ============================================================= - -// The program solves the linear equation Ax=b, where matrix A is a -// n x n sparse matrix with diagonals [1 1 5 1 1], -// vector b is set such that the solution is [1 1 ... 1]^T. -// The linear system is solved via Jacobi iteration. -// The algorithm converges, as the matrix A is strictly diagonally dominant. - -#include -#include -#include -// Location of file: /dev-utilities//include -#include "dpc_common.hpp" - -using namespace std; -using namespace sycl; - -// Helper structure to initialize and hold all our SYCL buffers. -// Note: no bugs here. -struct buffer_args { - buffer b; - buffer x_k; - buffer x_k1; - buffer l1_norm_x_k1; - buffer abs_error; - buffer_args(size_t n, float *b, - float *x_k, float *x_k1, - float *l1_norm_x_k1, - float *abs_error): - // Right hand side vector b; - b(buffer(b, range{n})), - // Iteration vectors x_k and x_k1; - x_k(buffer(x_k, range{n})), - x_k1(buffer(x_k1, range{n})), - // Sum of absolute values of x_k1 elements. - l1_norm_x_k1(buffer(l1_norm_x_k1, range{1})), - // Absolute error. - abs_error(buffer(abs_error, range{1})) - {} -}; - -// Initialize right hand side vector b and the initial guess for x_k. -void initialize_input(float *b, float *x_k, size_t n); - -// At each iteration the computation of the resulting vector x happens here. -void main_computation (queue &q, buffer_args &buffers, size_t n) { - // Here, we compute the updated vector x_k1. - q.submit([&](auto &h) { - accessor acc_b(buffers.b, h, read_only); - accessor acc_x_k(buffers.x_k, h, read_only); - accessor acc_x_k1(buffers.x_k1, h, write_only); - - h.parallel_for(range{n}, [=](id<1> index) { - // Current index. - int i = index[0]; - - // The vector x_k1 should be computed as: - // x_k1 = D^{-1}(b - (A - D)x_k), - // where A is our matrix, D is its diagonal, b is right hand - // side vector, and x_k is the result of the previous iteration. - // - // Matrices (A - D) and D are hardcoded as: - // (A - D) is a stencil matrix [1 1 0 1 1]; - // D is a diagonal matrix with all elements equal to 5. - - float x_k1 = acc_b[i]; - - // Non-diagonal elements of matrix A are all 1s, so to substract - // i-th element of (A - D)x_k, we need to substract the sum of elements - // of x_k with indices i - 2, i - 1, i + 1, i + 2. We do not substract - // the i-th element, as it gets multiplied by 0 in (A - D)x_k. - // - // Fix bug 1: out-of-bounds access: all indices below can trigger - // out-of-bounds access and thus garbage values will be read. - // Fix it by adding checks that the index exists: - if (i > 1) - x_k1 -= acc_x_k[i - 2]; - if (i > 0) - x_k1 -= acc_x_k[i - 1]; - if (i < n - 1) - x_k1 -= acc_x_k[i + 1]; - if (i < n - 2) - x_k1 -= acc_x_k[i + 2]; - - // In our case the diagonal matrix has only 5s on the diagonal, so - // division by 5 gives us its invert. 
- x_k1 /= 5; - - // Save the value to the output buffer. - acc_x_k1[index] = x_k1; - }); - }); -} - -// Here we compute values which are used for relative error computation -// and copy the vector x_k1 over the vector x_k. -void prepare_for_next_iteration (queue &q, buffer_args &buffers,size_t n) { - constexpr size_t l = 16; - - // To compute the relative error we need to prepare two values: - // abs_error and l1_norm_x_k1. - // - // First, we need to compute the L1-norm of (x_k - x_k1). To do that, - // we sum absolute values of its elements into abs_error buffer. - // We use a reduction algorithm here. - q.submit([&](auto &h) { - accessor acc_abs_error(buffers.abs_error, h, read_write); - accessor acc_x_k(buffers.x_k, h, read_only); - accessor acc_x_k1(buffers.x_k1, h, read_only); - auto r_abs_error = reduction(buffers.abs_error, h, std::plus<>()); - - h.parallel_for(nd_range<1>{n, l}, - r_abs_error, - [=](nd_item<1> index, auto& acc_abs_error) { - auto gid = index.get_global_id(); - - // Compute the sum. - acc_abs_error += abs(acc_x_k[gid] - acc_x_k1[gid]); - }); - }); - - // Second, we need to compute the L1 norm of x_k1. - // For that, we sum absolute values of all elements of vector x_k1 into - // l1_norm_x_k1 buffer. Again we use the reduction algorithm here. - q.submit([&](auto &h) { - accessor acc_x_k1(buffers.x_k1, h, read_only); - accessor acc_l1_norm_x_k1(buffers.l1_norm_x_k1, h, read_write); - auto r_l1_norm_x_k1 = reduction(buffers.l1_norm_x_k1, h, std::plus<>()); - - h.parallel_for(nd_range<1>{n, l}, - r_l1_norm_x_k1, - [=](nd_item<1> index, auto& acc_l1_norm_x_k1) { - auto gid = index.get_global_id(); - // Compute the sum. - acc_l1_norm_x_k1 += abs(acc_x_k1[gid]); // Bug 2 challenge: breakpoint here. - }); - }); - - // We copy the values of vector x_k1 to x_k in order to setup - // the next iteration. - q.submit([&](auto &h) { - accessor acc_x_k(buffers.x_k, h, write_only); - accessor acc_x_k1(buffers.x_k1, h, read_only); - - h.parallel_for(range{n}, [=](id<1> index) { - auto gid = index[0]; - - // Copy the vector x_k1 over x_k. - acc_x_k[gid] = acc_x_k1[gid]; - }); - }); -} - -int main(int argc, char *argv[]) { - // The size of the problem. - // The matrix A is n x n matrix, the length of the vector x is n. - constexpr size_t n = 64; - - // The maximum number of iterations the algorithm is going - // to take. - constexpr size_t max_number_of_iterations = 100; - - // We expect each element of vector x to be that close - // to the analitycal solution. - constexpr float tolerance = 1e-4; - - // The right hand side vector. - float b[n]; - - // We store an intermediate result after every iteration here. - float x_k[n]; - - // At each iteration we compute a new value of x here. We need - // both buffers x_k and x_k1 due to the data dependency: to compute - // one element at the iteration k + 1 we need several elements - // from the iteration k. - float x_k1[n]; - - // We will compute the relative error at each iteration as: - // - // ||x_k - x_k1||_1 abs_error - // rel_error = ------------------- = -------------- - // ||x_k1||_1 l1_norm_x_k1 - - // Absolute error, ||x_k - x_k1||_1, L1-norm of (x_k - x_k1). - float abs_error = 0; - // ||x_k1||_1, L1-norm of x_k1. - float l1_norm_x_k1 = 0; - // Relative error. - float rel_error; - - // Initialize the input. - // Note: the matrix A is hardcoded as a stencil matrix [1 1 5 1 1] - // into the kernel. - initialize_input(b, x_k, n); - - // Iteration counter. 
- int k = 0; - try { - queue q(default_selector_v, dpc_common::exception_handler); - cout << "[SYCL] Using device: [" - << q.get_device().get_info() << "] from [" - << q.get_device().get_platform().get_info() - << "]\n"; - - // Jacobi iteration begins. - do {// k-th iteration of Jacobi. - // Fix bug 2: we have to reset the error values at each iteration, otherwise - // the relative error accumulates through iterations and does not fall - // below the tolerance. - abs_error = 0; - l1_norm_x_k1 = 0; - - { // Fix bug 3: the host values of abs_error and l1_norm_x_k1 - // were not synchronised with their new values on device. - // Open new scope for buffers. Once the scope is ended, the destructors - // of buffers will write the data back from device to host. - - // Create SYCL buffers. - buffer_args buffers {n, b, x_k, x_k1, &l1_norm_x_k1, &abs_error}; - - main_computation(q, buffers, n); - prepare_for_next_iteration(q, buffers, n); - } - - // Compute relative error based on reduced values from this iteration. - rel_error = abs_error / (l1_norm_x_k1 + 1e-32); - - if (abs_error < 0 || l1_norm_x_k1 < 0 - || (abs_error + l1_norm_x_k1) < 1e-32) { - cout << "\nfail; Bug 3. Fix it on GPU. The relative error has invalid value " - << "after iteration " << k << ".\n" - << "Hint 1: inspect reduced error values. With the challenge scenario\n" - << " from bug 2 you can verify that reduction algorithms compute\n" - << " the correct values inside kernel on GPU. Take into account\n" - << " SIMD lanes: on GPU each thread processes several work items\n" - << " at once, so you need to modify your commands and update\n" - << " the convenience variable for each SIMD lane.\n" - << "Hint 2: why don't we get the correct values at the host part of\n" - << " the application?\n"; - return 0; - } - - // Periodically print out how the algorithm behaves. - if (k % 20 == 0) { - std::cout << "Iteration " << k << ", relative error = " - << rel_error << "\n"; - } - - k++; - } while (rel_error > tolerance && k < max_number_of_iterations); - // Jacobi iteration ends. - - } catch (sycl::exception const &e) { - cout << "fail; synchronous exception occurred: " << e.what() << "\n"; - return -1; - } - - // Verify the output, we expect a vector whose components are close to 1.0. - bool correct = true; - for (int i = 0; i < n; i++) { - if ((x_k[i] - 1.0f) * (x_k[i] - 1.0f) > tolerance) - correct = false; - } - - if (correct) - cout << "\nsuccess; all elements of the resulting vector are close to 1.0.\n"; - else { - cout << "\nfail; Bug 1. Fix this on CPU: components of x_k are not close to 1.0.\n" - << "Hint: figure out which elements are farthest from 1.0.\n"; - return 0; - } - - // Check whether the algorithm converged. - if (k < max_number_of_iterations) { - cout << "success; the relative error (" << rel_error - << ") is below the desired tolerance " - << tolerance <<" after " << k <<" iterations.\n\n"; - } else { - cout << "\nfail; Bug 2. Fix this on CPU: the relative error (" << rel_error - << ") is greater than\n" - << " the desired tolerance " - << tolerance <<" after " << max_number_of_iterations - << " iterations.\n" - << "Hint: check the reduction results at several iterations.\n" - << "Challenge: in the debugger you can simmulate the computation of a reduced\n" - << " value by putting a BP inside the corresponding kernel and defining\n" - << " a convenience variable. We will compute the reduced value at this\n" - << " convenience variable: at each BP hit we update it with a help of \"commands\"\n" - << " command. 
After the reduction kernel is finished, the convenience\n" - << " variable should contain the reduced value.\n" - << " See README for details.\n"; - return 0; - } - - return 0; -} - -// Note: no bugs here. -void initialize_input(float *b, float *x_k, size_t n) { - constexpr float main_b = 9; - - // Vector b and the matrix A are hardcoded - // such that the analytical solution of the system Ax=b is a vector - // whose elements are 1s. - for (int i = 0; i < n; i++) - b[i] = main_b; - - // Boundary values of the vector b. - b[0] = main_b - 2; - b[1] = main_b - 1; - b[n - 2] = main_b - 1; - b[n - 1] = main_b -2; - - // Initial guess of x is b. - for (int i = 0; i < n; i++) - x_k[i] = b[i]; -} diff --git a/Tools/ApplicationDebugger/jacobi/src/jacobi.cpp b/Tools/ApplicationDebugger/jacobi/src/jacobi.cpp new file mode 100644 index 0000000000..e7b4ce73a5 --- /dev/null +++ b/Tools/ApplicationDebugger/jacobi/src/jacobi.cpp @@ -0,0 +1,168 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// The program solves the linear equation Ax=b, where matrix A is a +// n x n sparse matrix with diagonals [1 1 5 1 1], +// vector b is set such that the solution is [1 1 ... 1]^T. +// The linear system is solved via Jacobi iteration. +// The algorithm converges, as the matrix A is strictly diagonally dominant. + +#include +#include +#include +// Location of file: /dev-utilities//include +#include "dpc_common.hpp" + +using namespace std; +using namespace sycl; + +// The size of the problem. +// The matrix A is n x n matrix, the length of the vector x is n. +constexpr size_t n = 64; + +// The maximum number of iterations the algorithm is going +// to take. +constexpr size_t max_number_of_iterations = 100; + +// We expect each element of vector x to be that close +// to the analitycal solution. +constexpr float tolerance = 1e-4; + +// Helper structure to initialize and hold all our SYCL buffers. +// Note: no bugs here. +struct buffer_args { + buffer b; + buffer x_k; + buffer x_k1; + buffer l1_norm_x_k1; + buffer abs_error; + buffer_args(float *b, float *x_k, float *x_k1, + float *l1_norm_x_k1, float *abs_error): + // Right hand side vector b; + b(buffer(b, range{n})), + // Iteration vectors x_k and x_k1; + x_k(buffer(x_k, range{n})), + x_k1(buffer(x_k1, range{n})), + // Sum of absolute values of x_k1 elements. + l1_norm_x_k1(buffer(l1_norm_x_k1, range{1})), + // Absolute error. + abs_error(buffer(abs_error, range{1})) + {} +}; + +// Depending on whether FIXED is set, select fixed or bugged versions +// of computation methods. +#ifdef FIXED +#include "fixed.cpp" +#else +#include "bugged.cpp" +#endif // FIXED + +// Initialize right hand side vector b and the initial guess for x_k. +void initialize_input(float *b, float *x_k); + +int main(int argc, char *argv[]) { + // The right hand side vector. + float b[n]; + + // We store an intermediate result after every iteration here. + float x_k[n]; + + // At each iteration we compute a new value of x here. We need + // both buffers x_k and x_k1 due to the data dependency: to compute + // one element at the iteration k + 1 we need several elements + // from the iteration k. + float x_k1[n]; + + // We will compute the relative error at each iteration as: + // + // ||x_k - x_k1||_1 abs_error + // rel_error = ------------------- = -------------- + // ||x_k1||_1 l1_norm_x_k1 + + // Relative error. 
+ float rel_error; + + // Initialize the input. + // Note: the matrix A is hardcoded as a stencil matrix [1 1 5 1 1] + // into the kernel. + initialize_input(b, x_k); + + // Iteration counter. + int k; + try { + queue q(default_selector_v, dpc_common::exception_handler); + cout << "[SYCL] Using device: [" + << q.get_device().get_info() << "] from [" + << q.get_device().get_platform().get_info() + << "]\n"; + + k = iterate (q, b, x_k, x_k1, rel_error); + } catch (sycl::exception const &e) { + cout << "fail; synchronous exception occurred: " << e.what() << "\n"; + return -1; + } + + // Verify the output, we expect a vector whose components are close to 1.0. + bool correct = true; + for (int i = 0; i < n; i++) { + if ((x_k[i] - 1.0f) * (x_k[i] - 1.0f) > tolerance) + correct = false; + } + + if (correct) + cout << "\nsuccess; all elements of the resulting vector are close to 1.0.\n"; + else { + cout << "\nfail; Bug 1. Fix this on CPU: components of x_k are not close to 1.0.\n" + << "Hint: figure out which elements are farthest from 1.0.\n"; + return 0; + } + + // Check whether the algorithm converged. + if (k < max_number_of_iterations) { + cout << "success; the relative error (" << rel_error + << ") is below the desired tolerance " + << tolerance <<" after " << k <<" iterations.\n\n"; + } else { + cout << "\nfail; Bug 2. Fix this on CPU: the relative error (" << rel_error + << ") is greater than\n" + << " the desired tolerance " + << tolerance <<" after " << max_number_of_iterations + << " iterations.\n" + << "Hint: check the reduction results at several iterations.\n" + << "Challenge: in the debugger you can simmulate the computation of a reduced\n" + << " value by putting a BP inside the corresponding kernel and defining\n" + << " a convenience variable. We will compute the reduced value at this\n" + << " convenience variable: at each BP hit we update it with a help of \"commands\"\n" + << " command. After the reduction kernel is finished, the convenience\n" + << " variable should contain the reduced value.\n" + << " See README for details.\n"; + return 0; + } + + return 0; +} + +// Note: no bugs here. +void initialize_input(float *b, float *x_k) { + constexpr float main_b = 9; + + // Vector b and the matrix A are hardcoded + // such that the analytical solution of the system Ax=b is a vector + // whose elements are 1s. + for (int i = 0; i < n; i++) + b[i] = main_b; + + // Boundary values of the vector b. + b[0] = main_b - 2; + b[1] = main_b - 1; + b[n - 2] = main_b - 1; + b[n - 1] = main_b - 2; + + // Initial guess of x is b. + for (int i = 0; i < n; i++) + x_k[i] = b[i]; +}