From 0cd135b4981d08d8d2522704f63f22ef6560dd6e Mon Sep 17 00:00:00 2001 From: "Tu, Peng" Date: Tue, 14 Mar 2023 11:50:30 -0700 Subject: [PATCH] Update GPU-Opt-Guide examples to 2023.03.14 --- .../06_scalars_private/test_scalars.cpp | 6 +- .../test_scalars_private.cpp | 6 +- .../test_scalars_private_2.cpp | 6 +- .../OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp | 8 +- .../23_omp_work_group/test_omp_work_group.cpp | 9 +- .../OpenMP/26_omp_prefetch/CMakeLists.txt | 5 + .../OpenMP/26_omp_prefetch/c/CMakeLists.txt | 1 + .../OpenMP/26_omp_prefetch/c/nbody_c.cpp | 150 ++++++++++++++++ .../26_omp_prefetch/c_simd/CMakeLists.txt | 2 + .../26_omp_prefetch/c_simd/nbody_c_simd.cpp | 169 ++++++++++++++++++ .../26_omp_prefetch/fortran/CMakeLists.txt | 2 + .../OpenMP/26_omp_prefetch/fortran/nbody_f.f | 159 ++++++++++++++++ .../GPU-Opt-Guide/OpenMP/CMakeLists.txt | 1 + Publications/GPU-Opt-Guide/atomics/align.hpp | 5 + .../GPU-Opt-Guide/atomics/atomics.cpp | 18 +- .../atomics/global_atomics_ref.cpp | 7 +- .../atomics/local_atomics_ref.cpp | 17 +- .../GPU-Opt-Guide/atomics/test_atomic.cpp | 28 ++- .../GPU-Opt-Guide/exec-model/vaddsync.cpp | 6 +- .../GPU-Opt-Guide/exec-model/vec-add.cpp | 4 +- .../grf-mode-selection/CMakeLists.txt | 1 - .../implicit-scaling/03_stream/stream.cpp | 14 +- .../04_stream_3D/stream_3D.cpp | 13 +- .../stream_cross_stack.cpp | 95 ++++++++++ .../06_cross_stack/cross_stack_01.cpp | 13 ++ .../libraries-fcorr/fcorr_1d_buffers.cpp | 3 + .../libraries-fcorr/fcorr_1d_usm.cpp | 3 + .../libraries-fcorr/fcorr_1d_usm_fixed.cpp | 3 + .../libraries-stdlib/external_rand.cpp | 4 +- .../libraries-stdlib/rng_test.cpp | 3 + .../local-global-sync/atomics.cpp | 14 +- .../registers/block-load-store.cpp | 2 +- .../histogram256-int-shared-private.cpp | 12 +- .../registers/histogram256-int.cpp | 12 +- .../registers/histogram32-int-volatile.cpp | 12 +- .../registers/histogram32-int.cpp | 12 +- .../registers/histogram32-long.cpp | 12 +- .../GPU-Opt-Guide/slm/histogram-slm-1024.cpp | 12 +- .../GPU-Opt-Guide/slm/histogram-slm-256.cpp | 12 +- .../GPU-Opt-Guide/sub-group/CMakeLists.txt | 1 + .../sub-group/sg-max-size-output.txt | 14 +- .../GPU-Opt-Guide/sub-group/sg-max-size.cpp | 4 +- .../GPU-Opt-Guide/sub-group/sub-group-0.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-1.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-3.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-5.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-6.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-7.cpp | 2 +- .../GPU-Opt-Guide/sub-group/sub-group-8.cpp | 49 +++++ .../GPU-Opt-Guide/sub-group/transpose.cpp | 32 ++-- 50 files changed, 819 insertions(+), 154 deletions(-) create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f create mode 100644 Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp create mode 100644 Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp create mode 100644 Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp index 9314b38497..bf436533b1 100644 --- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp +++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp @@ -18,7 +18,7 @@ #define MAX 100 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX)) -#define IDX2(i, j) (i * P + j) +#define IDX2(i, j) (i * j) #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k) int main(void) { @@ -43,7 +43,7 @@ int main(void) { dx[i] = scaled_rand(); /* map data to device */ - #pragma omp target enter data map(to : u [0:SIZE], dx [0:P * P]) + #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P]) start = omp_get_wtime(); @@ -58,7 +58,7 @@ int main(void) { double us = 0.; double ut = 0.; - s1 = dx[IDX4(b, 0, 0, k)]; + s1 = dx[IDX2(b, k)]; s2 = u[IDX4(b, 0, 0, k)] + BLOCKS; s3 = 0.145; diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp index 533be8fd33..db1731b354 100644 --- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp +++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp @@ -18,7 +18,7 @@ #define MAX 100 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX)) -#define IDX2(i, j) (i * P + j) +#define IDX2(i, j) (i * j) #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k) int main(void) { @@ -43,7 +43,7 @@ int main(void) { dx[i] = scaled_rand(); /* map data to device */ - #pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P]) + #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P]) start = omp_get_wtime(); @@ -58,7 +58,7 @@ int main(void) { double us = 0.; double ut = 0.; - s1 = dx[IDX4(b, 0, 0, k)]; + s1 = dx[IDX2(b, k)]; s2 = u[IDX4(b, 0, 0, k)] + BLOCKS; s3 = 0.145; diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp index bc412e7756..78f885bc42 100644 --- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp +++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp @@ -18,7 +18,7 @@ #define MAX 100 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX)) -#define IDX2(i, j) (i * P + j) +#define IDX2(i, j) (i * j) #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k) int main(void) { @@ -42,7 +42,7 @@ int main(void) { dx[i] = scaled_rand(); /* map data to device */ - #pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P]) + #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P]) start = omp_get_wtime(); @@ -57,7 +57,7 @@ int main(void) { double us = 0.; double ut = 0.; - double s1 = dx[IDX4(b, 0, 0, k)]; + double s1 = dx[IDX2(b, k)]; double s2 = u[IDX4(b, 0, 0, k)] + BLOCKS; double s3 = 0.145; diff --git a/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp b/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp index 3dcc46e67b..c273ad2cba 100644 --- a/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp +++ b/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp @@ -147,11 +147,15 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } + FLOAT alpha, beta; + int niter, verify; int HA = atoi(argv[1]); int WA = atoi(argv[2]); int WB = atoi(argv[3]); - FLOAT alpha, beta; - int niter, verify; + + if ((HA == 0) || (WA == 0) || (WB == 0)) + exit(1); + if (argc > 4) { #if PRECISION == 1 diff --git a/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp b/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp index e8f753bcd3..18fc41df57 100644 --- a/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp +++ b/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2022 Intel Corporation +// Copyright © 203 Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -9,6 +9,9 @@ double * make_array(int n, double value) { double* array = static_cast(malloc(n * sizeof(double))); + if (array == NULL) + return NULL; + for (int i = 0; i < n; i++) { array[i] = value / (100.0 + i); } @@ -23,6 +26,8 @@ int main() { double* A = make_array(N, 0.8); double* B = make_array(N, 0.65); double* C = make_array(N*N, 2.5); + if ((A == NULL) || (B == NULL) || (C == NULL)) + exit(1); int i, j; double val = 0.0; @@ -38,7 +43,7 @@ int main() { } } - printf("Reduced val[%f10.3]", val); + printf("val = %f10.3\n", val); free(A); free(B); diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt new file mode 100644 index 0000000000..a25b5cb077 --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt @@ -0,0 +1,5 @@ +add_compile_options(-g -mcmodel=medium) +add_definitions(-DPREFETCH) +add_subdirectory(c) +add_subdirectory(fortran) +add_subdirectory(c_simd) diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt new file mode 100644 index 0000000000..1d36e7ae22 --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt @@ -0,0 +1 @@ +add_example(nbody_c) diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp new file mode 100644 index 0000000000..156a4f451e --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp @@ -0,0 +1,150 @@ +#include +#include +#include + +#define CACHE_CLEAN_SIZE 100000000 +#define ITERATIONS 100 +#define ARRAYLEN1 4096 +#define ARRAYLEN2 32768 +// snippet-begin +#define WORKGROUP_SIZE 1024 +#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3; 2 = prefetch to L3 +#define TILE_SIZE 64 + +void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) { +#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE) + for (int i = 0; i < n1; i++) { + const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f; + const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f; + const float eps = 0.01f; + + float dx = 0.0; + float bb[TILE_SIZE]; + for (int j = 0; j < n2; j += TILE_SIZE) { + // load tile from b + for (int u = 0; u < TILE_SIZE; ++u) { + bb[u] = b[j + u]; +#ifdef PREFETCH + int next_tile = j + TILE_SIZE + u; + if ((next_tile % 16) == 0) { +#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2) + } +#endif + } +#pragma unroll(TILE_SIZE) + for (int u = 0; u < TILE_SIZE; ++u) { + float delta = bb[u] - a[i]; + float r2 = delta * delta; + float s0 = r2 + eps; + float s1 = 1.0f / sqrtf(s0); + float f = + (s1 * s1 * s1) - + (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5))))); + dx += f * delta; + } + } + c[i] = dx * 0.23f; + } +} +// snippet-end + +void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) { + for (int i = 0; i < n1; ++i) { + const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f; + const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f; + const float eps = 0.01f; + + float dx = 0.0f; + for (int j = 0; j < n2; ++j) { + float delta = b[j] - a[i]; + float r2 = delta * delta; + float s0 = r2 + eps; + float s1 = 1.0f / sqrtf(s0); + float f = (s1 * s1 * s1) - + (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5))))); + dx += f * delta; + } + c[i] = dx * 0.23f; + } +} + +void clean_cache_gpu(double *d, int n) { + +#pragma omp target teams distribute parallel for thread_limit(1024) + for (unsigned i = 0; i < n; ++i) + d[i] = i; + + return; +} + +int main() { + + float *a, *b, *c; + double *d; + + a = new float[ARRAYLEN1]; + b = new float[ARRAYLEN2]; + c = new float[ARRAYLEN1]; + d = new double[CACHE_CLEAN_SIZE]; + + // intialize + float dx = 1.0f / (float)ARRAYLEN2; + b[0] = 0.0f; + for (int i = 1; i < ARRAYLEN2; ++i) { + b[i] = b[i - 1] + dx; + } + for (int i = 0; i < ARRAYLEN1; ++i) { + a[i] = b[i]; + c[i] = 0.0f; + } + +#pragma omp target + {} + +#pragma omp target enter data map(alloc \ + : a [0:ARRAYLEN1], b [0:ARRAYLEN2], \ + c [0:ARRAYLEN1]) +#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE]) + +#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2]) + + double t1, t2, elapsed_s = 0.0; + for (int i = 0; i < ITERATIONS; ++i) { + clean_cache_gpu(d, CACHE_CLEAN_SIZE); + + t1 = omp_get_wtime(); + nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2); + t2 = omp_get_wtime(); + + elapsed_s += (t2 - t1); + } + +#pragma omp target update from(c [0:ARRAYLEN1]) + + double sum = 0.0f; + for (int i = 0; i < ARRAYLEN1; ++i) + sum += c[i]; + printf("Obtained output = %8.3f\n", sum); + + for (int i = 0; i < ARRAYLEN1; ++i) + c[i] = 0.0f; + nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2); + sum = 0.0f; + for (int i = 0; i < ARRAYLEN1; ++i) + sum += c[i]; + printf("Expected output = %8.3f\n", sum); + + printf("\nTotal time = %8.1f milliseconds\n", (elapsed_s * 1000)); + +#pragma omp target exit data map(delete \ + : a [0:ARRAYLEN1], b [0:ARRAYLEN2], \ + c [0:ARRAYLEN1]) +#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE]) + + delete[] a; + delete[] b; + delete[] c; + delete[] d; + + return 0; +} diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt new file mode 100644 index 0000000000..6ec8e627ac --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_compile_options(-fopenmp-target-simd) +add_example(nbody_c_simd) diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp new file mode 100644 index 0000000000..fc6cffffd8 --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp @@ -0,0 +1,169 @@ +#include +#include +#include +#include + +#define CACHE_CLEAN_SIZE 100000000 +#define ITERATIONS 100 +#define ARRAYLEN1 4096 +#define ARRAYLEN2 32768 +#define VECLEN 16 +// snippet-begin +#define WORKGROUP_SIZE 1024 +#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3; 2 = prefetch to L3 +#define TILE_SIZE 64 + +void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) { +#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE / \ + VECLEN) + for (int i = 0; i < n1; i += VECLEN) { + const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f; + const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f; + const float eps = 0.01f; + + float dx[VECLEN]; + float aa[VECLEN], bb[TILE_SIZE]; +#pragma omp simd simdlen(VECLEN) +#pragma unroll(0) + for (int v = 0; v < VECLEN; ++v) { + dx[v] = 0.0f; + aa[v] = a[i + v]; + } + for (int j = 0; j < n2; j += TILE_SIZE) { + // load tile from b + for (int u = 0; u < TILE_SIZE; u += VECLEN) { +#pragma omp simd simdlen(VECLEN) +#pragma unroll(0) + for (int v = 0; v < VECLEN; ++v) + bb[u + v] = b[j + u + v]; +#ifdef PREFETCH + int next_tile = j + TILE_SIZE + u; +#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2) +#endif + } +// compute current tile +#pragma omp simd simdlen(VECLEN) +#pragma unroll(0) + for (int v = 0; v < VECLEN; ++v) { +#pragma unroll(TILE_SIZE) + for (int u = 0; u < TILE_SIZE; ++u) { + float delta = bb[u] - aa[v]; + float r2 = delta * delta; + float s0 = r2 + eps; + float s1 = 1.0f / sqrtf(s0); + float f = + (s1 * s1 * s1) - + (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5))))); + dx[v] += f * delta; + } + } + } +#pragma omp simd simdlen(VECLEN) +#pragma unroll(0) + for (int v = 0; v < VECLEN; ++v) { + c[i + v] = dx[v] * 0.23f; + } + } +} +// snippet-end + +void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) { + for (int i = 0; i < n1; ++i) { + const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f; + const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f; + const float eps = 0.01f; + + float dx = 0.0f; + for (int j = 0; j < n2; ++j) { + float delta = b[j] - a[i]; + float r2 = delta * delta; + float s0 = r2 + eps; + float s1 = 1.0f / sqrtf(s0); + float f = (s1 * s1 * s1) - + (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5))))); + dx += f * delta; + } + c[i] = dx * 0.23f; + } +} + +void clean_cache_gpu(double *d, int n) { + +#pragma omp target teams distribute parallel for thread_limit(1024) + for (unsigned i = 0; i < n; ++i) + d[i] = i; + + return; +} + +int main() { + + float *a, *b, *c; + double *d; + + a = new float[ARRAYLEN1]; + b = new float[ARRAYLEN2]; + c = new float[ARRAYLEN1]; + d = new double[CACHE_CLEAN_SIZE]; + + // intialize + float dx = 1.0f / (float)ARRAYLEN2; + b[0] = 0.0f; + for (int i = 1; i < ARRAYLEN2; ++i) { + b[i] = b[i - 1] + dx; + } + for (int i = 0; i < ARRAYLEN1; ++i) { + a[i] = b[i]; + c[i] = 0.0f; + } + +#pragma omp target + {} + +#pragma omp target enter data map(alloc \ + : a [0:ARRAYLEN1], b [0:ARRAYLEN2], \ + c [0:ARRAYLEN1]) +#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE]) + +#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2]) + + double t1, t2, elapsed_s = 0.0; + for (int i = 0; i < ITERATIONS; ++i) { + clean_cache_gpu(d, CACHE_CLEAN_SIZE); + + t1 = omp_get_wtime(); + nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2); + t2 = omp_get_wtime(); + + elapsed_s += (t2 - t1); + } + +#pragma omp target update from(c [0:ARRAYLEN1]) + + float sum = 0.0f; + for (int i = 0; i < ARRAYLEN1; ++i) + sum += c[i]; + printf("Obtained output = %8.3f\n", sum); + + for (int i = 0; i < ARRAYLEN1; ++i) + c[i] = 0.0f; + nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2); + sum = 0.0f; + for (int i = 0; i < ARRAYLEN1; ++i) + sum += c[i]; + printf("Expected output = %8.3f\n", sum); + + printf("\nTotal time = %8.1f milliseconds\n", (elapsed_s * 1000)); + +#pragma omp target exit data map(delete \ + : a [0:ARRAYLEN1], b [0:ARRAYLEN2], \ + c [0:ARRAYLEN1]) +#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE]) + + delete[] a; + delete[] b; + delete[] c; + delete[] d; + + return 0; +} diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt new file mode 100644 index 0000000000..e9106a7dda --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt @@ -0,0 +1,2 @@ +add_compile_options(-fpconstant -fpp -ffast-math -fno-sycl-instrument-device-code) +add_fortran_example_with_mkl(nbody_f) diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f new file mode 100644 index 0000000000..a893aa7d8c --- /dev/null +++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f @@ -0,0 +1,159 @@ +#define CACHE_CLEAN_SIZE 100000000 +#define ITERATIONS 100 +#define ARRAYLEN1 4096 +#define ARRAYLEN2 32768 +c snippet-begin +#define WORKGROUP_SIZE 1024 +#define PREFETCH_HINT 4 ! 4 = prefetch to L1 and L3; 2 = prefetch to L3 +#define TILE_SIZE 64 + + subroutine nbody_1d_gpu(c, a, b, n1, n2) + implicit none + integer n1, n2 + real a(0:n1-1), b(0:n2-1), c(0:n1-1) + real dx, bb(0:TILE_SIZE-1), delta, r2, s0, s1, f + integer i,j,u,next + real ma0, ma1, ma2, ma3, ma4, ma5, eps + parameter (ma0=0.269327, ma1=-0.0750978, ma2=0.0114808) + parameter (ma3=-0.00109313, ma4=0.0000605491, ma5=-0.00000147177) + parameter (eps=0.01) + +!$omp target teams distribute parallel do thread_limit(WORKGROUP_SIZE) +!$omp& private(i,dx,j,u,bb,next,delta,r2,s0,s1,f) + do i = 0, n1-1 + dx = 0.0 + do j = 0, n2-1, TILE_SIZE + ! load tile from b + do u = 0, TILE_SIZE-1 + bb(u) = b(j+u) +#ifdef PREFETCH + next = j + TILE_SIZE + u + if (mod(next,16).eq.0) then +!$omp prefetch data(PREFETCH_HINT:b(next:next))if(next +#include +#include +#include + enum class Alignment : size_t { Normal = sizeof(void *), SSE = 16, diff --git a/Publications/GPU-Opt-Guide/atomics/atomics.cpp b/Publications/GPU-Opt-Guide/atomics/atomics.cpp index f3a2c6858d..3e33a4eab6 100644 --- a/Publications/GPU-Opt-Guide/atomics/atomics.cpp +++ b/Publications/GPU-Opt-Guide/atomics/atomics.cpp @@ -134,10 +134,10 @@ int reductionInt(sycl::queue &q, std::vector &data, h.parallel_for(data_size, [=](auto index) { size_t glob_id = index[0]; - auto v = sycl::ext::oneapi::atomic_ref< - int, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(sum_acc[0]); + auto v = sycl::atomic_ref( + sum_acc[0]); v.fetch_add(buf_acc[glob_id]); }); }); @@ -200,10 +200,10 @@ int reductionFloat(sycl::queue &q, std::vector &data, h.parallel_for(data_size, [=](auto index) { size_t glob_id = index[0]; - auto v = sycl::ext::oneapi::atomic_ref< - float, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(sum_acc[0]); + auto v = sycl::atomic_ref( + sum_acc[0]); v.fetch_add(buf_acc[glob_id]); }); }); @@ -225,7 +225,7 @@ int reductionFloat(sycl::queue &q, std::vector &data, int main(int argc, char *argv[]) { - sycl::queue q{sycl::default_selector{}, exception_handler}; + sycl::queue q{sycl::default_selector_v, exception_handler}; std::cout << q.get_device().get_info() << "\n"; { std::vector data(N, 1); diff --git a/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp b/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp index d4490b4ff3..84c135d312 100644 --- a/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp +++ b/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp @@ -20,10 +20,9 @@ int main() { sycl::accessor acc_a(bufa, h, sycl::read_only); h.parallel_for(sycl::nd_range<1>(N, M), [=](auto it) { auto i = it.get_global_id(); - sycl::ext::oneapi::atomic_ref + sycl::atomic_ref atomic_op(acc[0]); atomic_op += acc_a[i]; }); diff --git a/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp b/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp index bffe098da6..a3b674f887 100644 --- a/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp +++ b/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp @@ -19,20 +19,17 @@ int main() { auto e1 = q.submit([&](sycl::handler &h) { sycl::accessor b(global, h); sycl::accessor acc_a(bufa, h, sycl::read_only); - auto acc = sycl::accessor(NUM_WG, h); + auto acc = sycl::local_accessor(NUM_WG, h); h.parallel_for(sycl::nd_range<1>(N, M), [=](auto it) { auto i = it.get_global_id(0); auto group_id = it.get_group(0); - sycl::ext::oneapi::atomic_ref + sycl::atomic_ref atomic_op(acc[group_id]); - sycl::ext::oneapi::atomic_ref + sycl::atomic_ref atomic_op_global(b[0]); atomic_op += acc_a[i]; it.barrier(sycl::access::fence_space::local_space); diff --git a/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp b/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp index dd380abe53..f1ef096630 100644 --- a/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp +++ b/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp @@ -9,8 +9,6 @@ #include #include -sycl::default_selector d_selector; - template using VectorAllocator = AlignedAllocator; template using AlignedVector = std::vector>; @@ -47,10 +45,10 @@ int VectorInt(sycl::queue &q, int iter) { sycl::accessor b_acc(a_buf, h, sycl::read_only); h.parallel_for(num_items, [=](auto i) { - auto v = sycl::ext::oneapi::atomic_ref< - int, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(a_acc[0]); + auto v = sycl::atomic_ref( + a_acc[0]); v += b_acc[i]; }); }); @@ -85,10 +83,10 @@ int VectorFloat(sycl::queue &q, int iter) { sycl::accessor b_acc(a_buf, h, sycl::read_only); h.parallel_for(num_items, [=](auto i) { - auto v = sycl::ext::oneapi::atomic_ref< - float, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(a_acc[0]); + auto v = sycl::atomic_ref( + a_acc[0]); v += b_acc[i]; }); }); @@ -122,10 +120,10 @@ int VectorDouble(sycl::queue &q, int iter) { sycl::accessor b_acc(a_buf, h, sycl::read_only); h.parallel_for(num_items, [=](auto i) { - auto v = sycl::ext::oneapi::atomic_ref< - double, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(a_acc[0]); + auto v = sycl::atomic_ref( + a_acc[0]); v += b_acc[i]; }); }); @@ -140,7 +138,7 @@ int VectorDouble(sycl::queue &q, int iter) { int main() { - sycl::queue q(d_selector); + sycl::queue q(sycl::gpu_selector_v); VectorAllocator alloc; AlignedVector a(array_size, alloc); AlignedVector b(array_size, alloc); diff --git a/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp b/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp index 4579d11c63..012b13b8fe 100644 --- a/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp +++ b/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp @@ -8,7 +8,7 @@ #include #include -sycl::default_selector d_selector; +auto d_selector = sycl::default_selector_v; // Array type and data size for this example. constexpr size_t array_size = 3 * 5 * 7 * (1 << 18); @@ -41,7 +41,7 @@ int VectorAdd3(sycl::queue &q, const IntArray &a, const IntArray &b, size_t loc_id = index.get_local_id(); size_t start = grp_id * mysize; size_t end = start + mysize; - for (int i = 0; i < iter; i++) + for (int j = 0; j < iter; j++) for (size_t i = start + loc_id; i < end; i += wg_size) { sum_acc[i] = a_acc[i] + b_acc[i]; } @@ -80,7 +80,7 @@ int VectorAdd4(sycl::queue &q, const IntArray &a, const IntArray &b, size_t loc_id = index.get_local_id(); size_t start = grp_id * mysize; size_t end = start + mysize; - for (int i = 0; i < iter; i++) { + for (int j = 0; j < iter; j++) { for (size_t i = start + loc_id; i < end; i += wg_size) { sum_acc[i] = a_acc[i] + b_acc[i]; } diff --git a/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp b/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp index ef558c3d01..2c8957f0f5 100644 --- a/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp +++ b/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp @@ -8,7 +8,7 @@ #include #include -sycl::default_selector d_selector; +auto d_selector = sycl::default_selector_v; // Array type and data size for this example. constexpr size_t array_size = 3 * 5 * 7 * (1 << 17); @@ -70,7 +70,7 @@ int VectorAdd2(sycl::queue &q, const IntArray &a, const IntArray &b, size_t loc_id = index.get_local_id(); size_t start = grp_id * mysize; size_t end = start + mysize; - for (int i = 0; i < iter; i++) + for (int j = 0; j < iter; j++) for (size_t i = start + loc_id; i < end; i += wg_size) { sum_acc[i] = a_acc[i] + b_acc[i]; } diff --git a/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt b/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt index 9647261757..d776d83c6b 100644 --- a/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt +++ b/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt @@ -1,3 +1,2 @@ -add_subdirectory(openmp) #add_subdirectory(sycl) add_subdirectory(perf) diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp index d6fa01ee7c..3d6cf6bdbf 100644 --- a/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp +++ b/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp @@ -7,6 +7,7 @@ // Code for STREAM: #include #include +#include // compile via: // icpx -O2 -fiopenmp -fopenmp-targets=spir64 ./stream.cpp @@ -14,11 +15,11 @@ int main() { constexpr int64_t N = 256 * 1e6; - constexpr int64_t bytes = N * sizeof(int); + constexpr int64_t bytes = N * sizeof(int64_t); - int *a = static_cast(malloc(bytes)); - int *b = static_cast(malloc(bytes)); - int *c = static_cast(malloc(bytes)); + int64_t *a = static_cast(malloc(bytes)); + int64_t *b = static_cast(malloc(bytes)); + int64_t *c = static_cast(malloc(bytes)); #pragma omp target enter data map(alloc:a[0:N]) #pragma omp target enter data map(alloc:b[0:N]) @@ -40,8 +41,7 @@ int main() if (irep == 10) time = omp_get_wtime(); - #pragma omp target teams distribute parallel for \ - simd simdlen(32) thread_limit(256) + #pragma omp target teams distribute parallel for simd for (int i = 0; i < N; ++i) { c[i] = a[i] + b[i]; @@ -61,7 +61,7 @@ int main() } } - const int64_t streamed_bytes = 3 * N * sizeof(int); + const int64_t streamed_bytes = 3 * N * sizeof(int64_t); std::cout << "bandwidth = " << (streamed_bytes / time) * 1E-9 << " GB/s" << std::endl; diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp index d68875457c..d4b07d3d1e 100644 --- a/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp +++ b/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp @@ -16,14 +16,14 @@ int main(int argc, char **argv) { const int device_id = omp_get_default_device(); const int desired_total_size = 32 * 512 * 16384; - const std::size_t bytes = desired_total_size * sizeof(int); + const std::size_t bytes = desired_total_size * sizeof(int64_t); std::cout << "memory footprint = " << 3 * bytes * 1E-9 << " GB" << std::endl; - int *a = static_cast(omp_target_alloc_device(bytes, device_id)); - int *b = static_cast(omp_target_alloc_device(bytes, device_id)); - int *c = static_cast(omp_target_alloc_device(bytes, device_id)); + int64_t *a = static_cast(omp_target_alloc_device(bytes, device_id)); + int64_t *b = static_cast(omp_target_alloc_device(bytes, device_id)); + int64_t *c = static_cast(omp_target_alloc_device(bytes, device_id)); const int min = 64; const int max = 32768; @@ -56,8 +56,7 @@ int main(int argc, char **argv) { if (irep == warmup) time = omp_get_wtime(); - #pragma omp target teams distribute parallel for \ - simd simdlen(32) thread_limit(64) collapse(3) + #pragma omp target teams distribute parallel for simd collapse(3) for (int iz = 0; iz < lz; ++iz) { for (int iy = 0; iy < ly; ++iy) @@ -73,7 +72,7 @@ int main(int argc, char **argv) time = omp_get_wtime() - time; time = time / no_max_rep; - const int64_t streamed_bytes = 3 * total_size * sizeof(int); + const int64_t streamed_bytes = 3 * total_size * sizeof(int64_t); std::cout << "bandwidth = " << (streamed_bytes / time) * 1E-9 << " GB/s" << std::endl; diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp new file mode 100644 index 0000000000..834a22397c --- /dev/null +++ b/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp @@ -0,0 +1,95 @@ +//============================================================== +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// clang-format off +// Code for cross stack stream +#include +#include + +// compile via: +// icpx -O2 -fiopenmp -fopenmp-targets=spir64 ./stream_cross_stack.cpp +// run via: +// EnableWalkerPartition=1 ZE_AFFINITY_MASK=0 ./a.out + +template +void cross_stack_stream() { + + constexpr int64_t size = 256*1e6; + constexpr int64_t bytes = size * sizeof(int64_t); + + int64_t *a = static_cast(malloc( bytes )); + int64_t *b = static_cast(malloc( bytes )); + int64_t *c = static_cast(malloc( bytes )); + #pragma omp target enter data map( alloc:a[0:size] ) + #pragma omp target enter data map( alloc:b[0:size] ) + #pragma omp target enter data map( alloc:c[0:size] ) + + for ( int i = 0; i < size; ++i ) { + + a[i] = i + 1; + b[i] = i - 1; + c[i] = 0; + } + + #pragma omp target update to( a[0:size] ) + #pragma omp target update to( b[0:size] ) + #pragma omp target update to( c[0:size] ) + + const int num_max_rep = 100; + + double time; + + for ( int irep = 0; irep < num_max_rep+10; ++irep ) { + + if ( irep == 10 ) time = omp_get_wtime(); + + #pragma omp target teams distribute parallel for simd + for ( int j = 0; j < size; ++j ) { + + const int cache_line_id = j / 16; + + int i; + + if ( (cache_line_id%cross_stack_fraction) == 0 ) { + + i = (j+size/2)%size; + } + else { + + i = j; + } + + c[i] = a[i] + b[i]; + } + } + time = omp_get_wtime() - time; + time = time/num_max_rep; + + #pragma omp target update from( c[0:size] ) + + for ( int i = 0; i < size; ++i ) { + + if ( c[i] != 2*i ) { + + std::cout << "wrong results!" << std::endl; + exit(1); + } + } + + const int64_t streamed_bytes = 3 * size * sizeof(int64_t); + + std::cout << "cross_stack_percent = " << (1/(double)cross_stack_fraction)*100 + << "%, bandwidth = " << (streamed_bytes/time) * 1E-9 << " GB/s" << std::endl; +} + +int main() { + + cross_stack_stream< 1>(); + cross_stack_stream< 2>(); + cross_stack_stream< 4>(); + cross_stack_stream< 8>(); + cross_stack_stream<16>(); + cross_stack_stream<32>(); +} diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp new file mode 100644 index 0000000000..270bb9eb4f --- /dev/null +++ b/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp @@ -0,0 +1,13 @@ +//============================================================== +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// clang-format off +// Snippet begin +#pragma omp target teams distribute parallel for simd +for (int i = N - 1; i <= 0; --i) +{ + c[i] = a[i] + b[i]; +} +// Snippet end diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp index 3f8ce96b18..b484930f32 100644 --- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp +++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp @@ -3,6 +3,8 @@ // // SPDX-License-Identifier: MIT // ============================================================= + +// Snippet begin #include #include #include @@ -108,3 +110,4 @@ int main(int argc, char **argv) { "normalized correlation score of " << max_corr / N << "." << std::endl; } +// Snippet end diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp index 39f2314521..541864e085 100644 --- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp +++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp @@ -3,6 +3,8 @@ // // SPDX-License-Identifier: MIT // ============================================================= + +// Snippet begin #include #include #include @@ -102,3 +104,4 @@ int main(int argc, char **argv) { sycl::free(sig2, sycl_context); sycl::free(corr, sycl_context); } +// Snippet end diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp index 0d59a276ee..ee72cd2cfd 100644 --- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp +++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp @@ -3,6 +3,8 @@ // // SPDX-License-Identifier: MIT // ============================================================= + +// Snippet begin #include #include #include @@ -101,3 +103,4 @@ int main(int argc, char **argv) { sycl::free(sig2, sycl_context); sycl::free(corr, sycl_context); } +// Snippet end diff --git a/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp b/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp index e90048b22c..910eeadd12 100644 --- a/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp +++ b/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp @@ -6,11 +6,12 @@ // Compile: // dpcpp -D{HOST|CPU|GPU} -std=c++17 -fsycl external_rand.cpp -o external_rand +// Snippet begin #include #include #include -#define N 5 +constexpr int N = 5; extern SYCL_EXTERNAL int rand(void); @@ -41,3 +42,4 @@ int main(void) { // Cleanup sycl::free(test1, Q.get_context()); } +// Snippet end diff --git a/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp b/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp index a26c04b911..e1158356d7 100644 --- a/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp +++ b/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp @@ -3,6 +3,8 @@ // // SPDX-License-Identifier: MIT // ============================================================= + +// Snippet begin #include #include #include @@ -71,3 +73,4 @@ int main(int argc, char **argv) { sycl::free(test1, Q.get_context()); sycl::free(test2, Q.get_context()); } +// Snippet end diff --git a/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp b/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp index 14a9407334..fb8055f656 100644 --- a/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp +++ b/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp @@ -129,9 +129,7 @@ int reductionIntBarrier(sycl::queue &q, std::vector &data, int iter) { q.submit([&](auto &h) { sycl::accessor buf_acc(buf, h, sycl::read_only); sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init); - sycl::accessor - scratch(work_group_size, h); + sycl::local_accessor scratch(work_group_size, h); h.parallel_for(sycl::nd_range<1>{work_group_size, work_group_size}, [=](sycl::nd_item<1> item) { size_t loc_id = item.get_local_id(0); @@ -194,10 +192,10 @@ int reductionIntAtomic(sycl::queue &q, std::vector &data, int iter) { h.parallel_for(data_size, [=](auto index) { size_t glob_id = index[0]; - auto v = sycl::ext::oneapi::atomic_ref< - int, sycl::ext::oneapi::memory_order::relaxed, - sycl::ext::oneapi::memory_scope::device, - sycl::access::address_space::global_space>(sum_acc[0]); + auto v = sycl::atomic_ref( + sum_acc[0]); v.fetch_add(buf_acc[glob_id]); }); }); @@ -219,7 +217,7 @@ int reductionIntAtomic(sycl::queue &q, std::vector &data, int iter) { int main(int argc, char *argv[]) { - sycl::queue q{sycl::default_selector{}, exception_handler}; + sycl::queue q{sycl::default_selector_v, exception_handler}; std::cout << q.get_device().get_info() << "\n"; std::vector data(N, 1); reductionIntSerial(q, data, 1000); diff --git a/Publications/GPU-Opt-Guide/registers/block-load-store.cpp b/Publications/GPU-Opt-Guide/registers/block-load-store.cpp index c74c642ff1..7e07895b9d 100644 --- a/Publications/GPU-Opt-Guide/registers/block-load-store.cpp +++ b/Publications/GPU-Opt-Guide/registers/block-load-store.cpp @@ -22,7 +22,7 @@ int main() { h.parallel_for( sycl::nd_range(sycl::range{N}, sycl::range{32}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int x; using global_ptr = diff --git a/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp b/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp index 77e15211cf..8004370b36 100644 --- a/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp +++ b/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp @@ -30,7 +30,7 @@ int main() { << "\n"; // Snippet begin - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; constexpr int NUM_BINS = 256; std::vector hist(NUM_BINS, 0); @@ -42,11 +42,11 @@ int main() { sycl::accessor macc(mbuf, h, sycl::read_only); auto hacc = hbuf.get_access(h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -56,10 +56,10 @@ int main() { for (int k = 0; k < NUM_BINS / 16; k++) { histogram[k] = 0; } - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); // subgroup size is 16 #pragma unroll for (int j = 0; j < 16; j++) { diff --git a/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp b/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp index e3532413c2..9ab6f31368 100644 --- a/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp +++ b/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp @@ -30,7 +30,7 @@ int main() { << "\n"; // Snippet begin - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; constexpr int NUM_BINS = 256; std::vector hist(NUM_BINS, 0); @@ -42,11 +42,11 @@ int main() { sycl::accessor macc(mbuf, h, sycl::read_only); auto hacc = hbuf.get_access(h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -56,10 +56,10 @@ int main() { for (int k = 0; k < NUM_BINS; k++) { histogram[k] = 0; } - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (int i = 0; i < 8; i++) { unsigned int c = x & 0x1FU; diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp index 162ac59d59..486982e455 100644 --- a/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp +++ b/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp @@ -30,7 +30,7 @@ int main() { << "\n"; // Snippet begin - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; constexpr int NUM_BINS = 32; std::vector hist(NUM_BINS, 0); @@ -41,11 +41,11 @@ int main() { auto e = q.submit([&](auto &h) { sycl::accessor macc(mbuf, h, sycl::read_only); auto hacc = hbuf.get_access(h); - h.parallel_for(sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + h.parallel_for(sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -56,10 +56,10 @@ int main() { for (int k = 0; k < NUM_BINS; k++) { histogram[k] = 0; } - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = sg.load( - macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (int i = 0; i < 8; i++) { unsigned int c = x & 0x1FU; diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp index 0c72b211c8..8312d0e95c 100644 --- a/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp +++ b/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp @@ -30,7 +30,7 @@ int main() { << "\n"; // Snippet begin - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; constexpr int NUM_BINS = 32; std::vector hist(NUM_BINS, 0); @@ -42,11 +42,11 @@ int main() { sycl::accessor macc(mbuf, h, sycl::read_only); auto hacc = hbuf.get_access(h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -55,10 +55,10 @@ int main() { for (int k = 0; k < NUM_BINS; k++) { histogram[k] = 0; } - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (int i = 0; i < 8; i++) { unsigned int c = x & 0x1FU; diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp index ebf5eb1ad8..842403fe06 100644 --- a/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp +++ b/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp @@ -30,7 +30,7 @@ int main() { << "\n"; // Snippet begin - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; constexpr int NUM_BINS = 32; std::vector hist(NUM_BINS, 0); @@ -41,11 +41,11 @@ int main() { sycl::accessor macc(mbuf, h, sycl::read_only); auto hacc = hbuf.get_access(h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -55,10 +55,10 @@ int main() { for (int k = 0; k < NUM_BINS; k++) { histogram[k] = 0; } - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (int i = 0; i < 8; i++) { unsigned int c = x & 0x1FU; diff --git a/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp b/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp index 01cbc4e5f7..556e2dfbd7 100644 --- a/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp +++ b/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp @@ -27,7 +27,7 @@ int main() { // Snippet begin constexpr int NUM_BINS = 1024; - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; std::vector hist(NUM_BINS, 0); sycl::buffer mbuf(input.data(), N); @@ -39,11 +39,11 @@ int main() { sycl::local_accessor local_histogram(sycl::range(NUM_BINS), h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -66,10 +66,10 @@ int main() { } it.barrier(sycl::access::fence_space::local_space); - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (std::uint8_t shift : {0, 16, 32, 48}) { constexpr unsigned long mask = 0x3FFU; diff --git a/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp b/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp index 9a0160cbe0..03e963308f 100644 --- a/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp +++ b/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp @@ -32,7 +32,7 @@ int main() { // Snippet begin constexpr int NUM_BINS = 256; - constexpr int blockSize = 256; + constexpr int BLOCK_SIZE = 256; std::vector hist(NUM_BINS, 0); sycl::buffer mbuf(input.data(), N); @@ -44,11 +44,11 @@ int main() { sycl::local_accessor local_histogram(sycl::range(NUM_BINS), h); h.parallel_for( - sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}), + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), [=](sycl::nd_item<1> it) { int group = it.get_group()[0]; int gSize = it.get_local_range()[0]; - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroup = sg.get_group_id()[0]; @@ -71,10 +71,10 @@ int main() { } it.barrier(sycl::access::fence_space::local_space); - for (int k = 0; k < blockSize; k++) { + for (int k = 0; k < BLOCK_SIZE; k++) { unsigned long x = - sg.load(macc.get_pointer() + group * gSize * blockSize + - sgGroup * sgSize * blockSize + sgSize * k); + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); #pragma unroll for (std::uint8_t shift : {0, 8, 16, 24, 32, 40, 48, 56}) { constexpr unsigned long mask = 0xFFU; diff --git a/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt b/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt index 62d351a18a..4c774ec665 100644 --- a/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt +++ b/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt @@ -7,5 +7,6 @@ add_example(sub-group-4) add_example(sub-group-5) add_example(sub-group-6) add_example(sub-group-7) +add_example(sub-group-8) add_example(sub-group-sizes) add_example(transpose) diff --git a/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt b/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt index 13e736b7e0..ab0f3a5235 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt +++ b/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt @@ -1,7 +1,7 @@ -globalId = 0 sgMaxSize = 8 sgSize = 7 sId = 0 j = 0 k = 7 -globalId = 1 sgMaxSize = 8 sgSize = 7 sId = 1 j = 1 k = 8 -globalId = 2 sgMaxSize = 8 sgSize = 7 sId = 2 j = 2 k = 9 -globalId = 3 sgMaxSize = 8 sgSize = 7 sId = 3 j = 3 k = 10 -globalId = 4 sgMaxSize = 8 sgSize = 7 sId = 4 j = 4 k = 11 -globalId = 5 sgMaxSize = 8 sgSize = 7 sId = 5 j = 5 k = 12 -globalId = 6 sgMaxSize = 8 sgSize = 7 sId = 6 j = 6 k = 13 +globalId = 0 sgMaxSize = 16 sgSize = 7 sId = 0 j = 0 k = 7 +globalId = 1 sgMaxSize = 16 sgSize = 7 sId = 1 j = 1 k = 8 +globalId = 2 sgMaxSize = 16 sgSize = 7 sId = 2 j = 2 k = 9 +globalId = 3 sgMaxSize = 16 sgSize = 7 sId = 3 j = 3 k = 10 +globalId = 4 sgMaxSize = 16 sgSize = 7 sId = 4 j = 4 k = 11 +globalId = 5 sgMaxSize = 16 sgSize = 7 sId = 5 j = 5 k = 12 +globalId = 6 sgMaxSize = 16 sgSize = 7 sId = 6 j = 6 k = 13 diff --git a/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp b/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp index 0e31390e97..a277003ff3 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp @@ -72,9 +72,9 @@ int main() { auto e = q.submit([&](auto &h) { sycl::stream out(65536, 128, h); h.parallel_for(sycl::nd_range<1>(7, 7), - [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(8)]] { + [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { int i = it.get_global_linear_id(); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgMaxSize = sg.get_max_local_range()[0]; int sId = sg.get_local_id()[0]; diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp index 14583e18f7..fafe8070d4 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp @@ -17,7 +17,7 @@ int main() { [=](sycl::nd_item<1> it) { int groupId = it.get_group(0); int globalId = it.get_global_linear_id(); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroupId = sg.get_group_id()[0]; int sgId = sg.get_local_id()[0]; diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp index 4aefce6bbf..0c9eea3c08 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp @@ -17,7 +17,7 @@ int main() { [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(32)]] { int groupId = it.get_group(0); int globalId = it.get_global_linear_id(); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; int sgGroupId = sg.get_group_id()[0]; int sgId = sg.get_local_id()[0]; diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp index 9bad3ffdf8..2d15ca038d 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp @@ -20,7 +20,7 @@ int main() { h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}), [=](sycl::nd_item<1> it) { int i = it.get_global_linear_id(); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; i = (i / sgSize) * sgSize * 16 + (i % sgSize); for (int j = 0; j < sgSize * 16; j += sgSize) { diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp index 7d1178b43d..4120ace227 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp @@ -23,7 +23,7 @@ int main() { h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}), [=](sycl::nd_item<1> it) { int i = it.get_global_linear_id(); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); int sgSize = sg.get_local_range()[0]; i = (i / sgSize) * sgSize * 16 + (i % sgSize); for (int j = 0; j < sgSize * 16; j += sgSize) { diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp index 6862e13572..1d7dc7c86a 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp @@ -22,7 +22,7 @@ int main() { h.parallel_for( sycl::nd_range(sycl::range{N / 16}, sycl::range{32}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); sycl::vec x; using global_ptr = diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp index 83da19af88..9575622ff8 100644 --- a/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp @@ -22,7 +22,7 @@ int main() { h.parallel_for( sycl::nd_range(sycl::range{N / 16}, sycl::range{32}), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); sycl::vec x; int base = (it.get_group(0) * 32 + diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp new file mode 100644 index 0000000000..b7f0d6c993 --- /dev/null +++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp @@ -0,0 +1,49 @@ +//============================================================== +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include + +int main() { + sycl::queue q{sycl::gpu_selector_v, + sycl::property::queue::enable_profiling{}}; + + std::cout << "Device: " << q.get_device().get_info() + << std::endl; + + // Snippet begin + constexpr int N = 1024 * 1024; + int *data = sycl::malloc_shared(N, q); + int *data2 = sycl::malloc_shared(N, q); + memset(data2, 0xFF, sizeof(int) * N); + + auto e = q.submit([&](auto &h) { + h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}), + [=](sycl::nd_item<1> it) { + int i = it.get_global_linear_id(); + auto sg = it.get_sub_group(); + int sgSize = sg.get_local_range()[0]; + i = (i / sgSize) * sgSize * 16 + (i % sgSize) * 4; + for (int j = 0; j < 4; j++) { + sycl::vec x; + sycl::vec *q = + (sycl::vec *)(&(data2[i + j * sgSize * 4])); + x = *q; + sycl::vec *r = + (sycl::vec *)(&(data[i + j * sgSize * 4])); + *r = x; + } + }); + }); + // Snippet end + q.wait(); + std::cout << "Kernel time = " + << (e.template get_profiling_info< + sycl::info::event_profiling::command_end>() - + e.template get_profiling_info< + sycl::info::event_profiling::command_start>()) + << " ns" << std::endl; + return 0; +} diff --git a/Publications/GPU-Opt-Guide/sub-group/transpose.cpp b/Publications/GPU-Opt-Guide/sub-group/transpose.cpp index 11d719534b..7724d63a8c 100644 --- a/Publications/GPU-Opt-Guide/sub-group/transpose.cpp +++ b/Publications/GPU-Opt-Guide/sub-group/transpose.cpp @@ -34,44 +34,44 @@ int main() { { // Snippet begin - constexpr size_t blockSize = 16; + constexpr size_t BLOCK_SIZE = 16; sycl::buffer m(matrix.data(), sycl::range<2>(N, N)); auto e = q.submit([&](auto &h) { sycl::accessor marr(m, h); - sycl::local_accessor barr1(sycl::range<2>(blockSize, blockSize), - h); - sycl::local_accessor barr2(sycl::range<2>(blockSize, blockSize), - h); + sycl::local_accessor barr1( + sycl::range<2>(BLOCK_SIZE, BLOCK_SIZE), h); + sycl::local_accessor barr2( + sycl::range<2>(BLOCK_SIZE, BLOCK_SIZE), h); h.parallel_for( - sycl::nd_range<2>(sycl::range<2>(N / blockSize, N), - sycl::range<2>(1, blockSize)), + sycl::nd_range<2>(sycl::range<2>(N / BLOCK_SIZE, N), + sycl::range<2>(1, BLOCK_SIZE)), [=](sycl::nd_item<2> it) [[intel::reqd_sub_group_size(16)]] { int gi = it.get_group(0); int gj = it.get_group(1); - sycl::ext::oneapi::sub_group sg = it.get_sub_group(); + auto sg = it.get_sub_group(); uint sgId = sg.get_local_id()[0]; - uint bcol[blockSize]; - int ai = blockSize * gi; - int aj = blockSize * gj; + uint bcol[BLOCK_SIZE]; + int ai = BLOCK_SIZE * gi; + int aj = BLOCK_SIZE * gj; - for (uint k = 0; k < blockSize; k++) { + for (uint k = 0; k < BLOCK_SIZE; k++) { bcol[k] = sg.load(marr.get_pointer() + (ai + k) * N + aj); } - uint tcol[blockSize]; - for (uint n = 0; n < blockSize; n++) { + uint tcol[BLOCK_SIZE]; + for (uint n = 0; n < BLOCK_SIZE; n++) { if (sgId == n) { - for (uint k = 0; k < blockSize; k++) { + for (uint k = 0; k < BLOCK_SIZE; k++) { tcol[k] = sg.shuffle(bcol[n], k); } } } - for (uint k = 0; k < blockSize; k++) { + for (uint k = 0; k < BLOCK_SIZE; k++) { sg.store(marr.get_pointer() + (ai + k) * N + aj, tcol[k]); } });