From 0cd135b4981d08d8d2522704f63f22ef6560dd6e Mon Sep 17 00:00:00 2001
From: "Tu, Peng" <peng.tu@intel.com>
Date: Tue, 14 Mar 2023 11:50:30 -0700
Subject: [PATCH] Update GPU-Opt-Guide examples to 2023.03.14

---
 .../06_scalars_private/test_scalars.cpp       |   6 +-
 .../test_scalars_private.cpp                  |   6 +-
 .../test_scalars_private_2.cpp                |   6 +-
 .../OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp      |   8 +-
 .../23_omp_work_group/test_omp_work_group.cpp |   9 +-
 .../OpenMP/26_omp_prefetch/CMakeLists.txt     |   5 +
 .../OpenMP/26_omp_prefetch/c/CMakeLists.txt   |   1 +
 .../OpenMP/26_omp_prefetch/c/nbody_c.cpp      | 150 ++++++++++++++++
 .../26_omp_prefetch/c_simd/CMakeLists.txt     |   2 +
 .../26_omp_prefetch/c_simd/nbody_c_simd.cpp   | 169 ++++++++++++++++++
 .../26_omp_prefetch/fortran/CMakeLists.txt    |   2 +
 .../OpenMP/26_omp_prefetch/fortran/nbody_f.f  | 159 ++++++++++++++++
 .../GPU-Opt-Guide/OpenMP/CMakeLists.txt       |   1 +
 Publications/GPU-Opt-Guide/atomics/align.hpp  |   5 +
 .../GPU-Opt-Guide/atomics/atomics.cpp         |  18 +-
 .../atomics/global_atomics_ref.cpp            |   7 +-
 .../atomics/local_atomics_ref.cpp             |  17 +-
 .../GPU-Opt-Guide/atomics/test_atomic.cpp     |  28 ++-
 .../GPU-Opt-Guide/exec-model/vaddsync.cpp     |   6 +-
 .../GPU-Opt-Guide/exec-model/vec-add.cpp      |   4 +-
 .../grf-mode-selection/CMakeLists.txt         |   1 -
 .../implicit-scaling/03_stream/stream.cpp     |  14 +-
 .../04_stream_3D/stream_3D.cpp                |  13 +-
 .../stream_cross_stack.cpp                    |  95 ++++++++++
 .../06_cross_stack/cross_stack_01.cpp         |  13 ++
 .../libraries-fcorr/fcorr_1d_buffers.cpp      |   3 +
 .../libraries-fcorr/fcorr_1d_usm.cpp          |   3 +
 .../libraries-fcorr/fcorr_1d_usm_fixed.cpp    |   3 +
 .../libraries-stdlib/external_rand.cpp        |   4 +-
 .../libraries-stdlib/rng_test.cpp             |   3 +
 .../local-global-sync/atomics.cpp             |  14 +-
 .../registers/block-load-store.cpp            |   2 +-
 .../histogram256-int-shared-private.cpp       |  12 +-
 .../registers/histogram256-int.cpp            |  12 +-
 .../registers/histogram32-int-volatile.cpp    |  12 +-
 .../registers/histogram32-int.cpp             |  12 +-
 .../registers/histogram32-long.cpp            |  12 +-
 .../GPU-Opt-Guide/slm/histogram-slm-1024.cpp  |  12 +-
 .../GPU-Opt-Guide/slm/histogram-slm-256.cpp   |  12 +-
 .../GPU-Opt-Guide/sub-group/CMakeLists.txt    |   1 +
 .../sub-group/sg-max-size-output.txt          |  14 +-
 .../GPU-Opt-Guide/sub-group/sg-max-size.cpp   |   4 +-
 .../GPU-Opt-Guide/sub-group/sub-group-0.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-1.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-3.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-5.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-6.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-7.cpp   |   2 +-
 .../GPU-Opt-Guide/sub-group/sub-group-8.cpp   |  49 +++++
 .../GPU-Opt-Guide/sub-group/transpose.cpp     |  32 ++--
 50 files changed, 819 insertions(+), 154 deletions(-)
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt
 create mode 100644 Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f
 create mode 100644 Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp
 create mode 100644 Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp
 create mode 100644 Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp

diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp
index 9314b38497..bf436533b1 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp
+++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp
@@ -18,7 +18,7 @@
 #define MAX 100
 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
 
-#define IDX2(i, j) (i * P + j)
+#define IDX2(i, j) (i * j)
 #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
 
 int main(void) {
@@ -43,7 +43,7 @@ int main(void) {
     dx[i] = scaled_rand();
 
   /* map data to device */
-  #pragma omp target enter data map(to : u [0:SIZE], dx [0:P * P])
+  #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
 
   start = omp_get_wtime();
 
@@ -58,7 +58,7 @@ int main(void) {
           double us = 0.;
           double ut = 0.;
 
-          s1 = dx[IDX4(b, 0, 0, k)];
+          s1 = dx[IDX2(b, k)];
           s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
           s3 = 0.145;
 
diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp
index 533be8fd33..db1731b354 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp
+++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp
@@ -18,7 +18,7 @@
 #define MAX 100
 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
 
-#define IDX2(i, j) (i * P + j)
+#define IDX2(i, j) (i * j)
 #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
 
 int main(void) {
@@ -43,7 +43,7 @@ int main(void) {
     dx[i] = scaled_rand();
 
   /* map data to device */
-  #pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
+  #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
 
   start = omp_get_wtime();
 
@@ -58,7 +58,7 @@ int main(void) {
           double us = 0.;
           double ut = 0.;
 
-          s1 = dx[IDX4(b, 0, 0, k)];
+          s1 = dx[IDX2(b, k)];
           s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
           s3 = 0.145;
 
diff --git a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp
index bc412e7756..78f885bc42 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp
+++ b/Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp
@@ -18,7 +18,7 @@
 #define MAX 100
 #define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
 
-#define IDX2(i, j) (i * P + j)
+#define IDX2(i, j) (i * j)
 #define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
 
 int main(void) {
@@ -42,7 +42,7 @@ int main(void) {
     dx[i] = scaled_rand();
 
   /* map data to device */
-  #pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
+  #pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
 
   start = omp_get_wtime();
 
@@ -57,7 +57,7 @@ int main(void) {
           double us = 0.;
           double ut = 0.;
 
-          double s1 = dx[IDX4(b, 0, 0, k)];
+          double s1 = dx[IDX2(b, k)];
           double s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
           double s3 = 0.145;
 
diff --git a/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp b/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp
index 3dcc46e67b..c273ad2cba 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp
+++ b/Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp
@@ -147,11 +147,15 @@ int main(int argc, char **argv) {
     return EXIT_FAILURE;
   }
 
+  FLOAT alpha, beta;
+  int niter, verify;
   int HA = atoi(argv[1]);
   int WA = atoi(argv[2]);
   int WB = atoi(argv[3]);
-  FLOAT alpha, beta;
-  int niter, verify;
+
+  if ((HA == 0) || (WA == 0) || (WB == 0))
+    exit(1);
+
   if (argc > 4) {
 
 #if PRECISION == 1
diff --git a/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp b/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp
index e8f753bcd3..18fc41df57 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp
+++ b/Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp
@@ -1,5 +1,5 @@
 //==============================================================
-// Copyright © 2022 Intel Corporation
+// Copyright © 203 Intel Corporation
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
@@ -9,6 +9,9 @@
 
 double * make_array(int n, double value) {
   double* array = static_cast<double*>(malloc(n * sizeof(double)));
+  if (array == NULL)
+    return NULL;
+
   for (int i = 0; i < n; i++) {
     array[i] = value / (100.0 + i);
   }
@@ -23,6 +26,8 @@ int main() {
   double* A = make_array(N, 0.8);
   double* B = make_array(N, 0.65);
   double* C = make_array(N*N, 2.5);
+  if ((A == NULL) || (B == NULL) || (C == NULL))
+    exit(1);
 
   int i, j;
   double val = 0.0;
@@ -38,7 +43,7 @@ int main() {
     }
   }
 
-  printf("Reduced val[%f10.3]", val);
+  printf("val = %f10.3\n", val);
 
   free(A);
   free(B);
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt
new file mode 100644
index 0000000000..a25b5cb077
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_compile_options(-g -mcmodel=medium)
+add_definitions(-DPREFETCH)
+add_subdirectory(c)
+add_subdirectory(fortran)
+add_subdirectory(c_simd)
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt
new file mode 100644
index 0000000000..1d36e7ae22
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/CMakeLists.txt
@@ -0,0 +1 @@
+add_example(nbody_c)
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp
new file mode 100644
index 0000000000..156a4f451e
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp
@@ -0,0 +1,150 @@
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+
+#define CACHE_CLEAN_SIZE 100000000
+#define ITERATIONS 100
+#define ARRAYLEN1 4096
+#define ARRAYLEN2 32768
+// snippet-begin
+#define WORKGROUP_SIZE 1024
+#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3;  2 = prefetch to L3
+#define TILE_SIZE 64
+
+void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) {
+#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE)
+  for (int i = 0; i < n1; i++) {
+    const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
+    const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
+    const float eps = 0.01f;
+
+    float dx = 0.0;
+    float bb[TILE_SIZE];
+    for (int j = 0; j < n2; j += TILE_SIZE) {
+      // load tile from b
+      for (int u = 0; u < TILE_SIZE; ++u) {
+        bb[u] = b[j + u];
+#ifdef PREFETCH
+        int next_tile = j + TILE_SIZE + u;
+        if ((next_tile % 16) == 0) {
+#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2)
+        }
+#endif
+      }
+#pragma unroll(TILE_SIZE)
+      for (int u = 0; u < TILE_SIZE; ++u) {
+        float delta = bb[u] - a[i];
+        float r2 = delta * delta;
+        float s0 = r2 + eps;
+        float s1 = 1.0f / sqrtf(s0);
+        float f =
+            (s1 * s1 * s1) -
+            (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
+        dx += f * delta;
+      }
+    }
+    c[i] = dx * 0.23f;
+  }
+}
+// snippet-end
+
+void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) {
+  for (int i = 0; i < n1; ++i) {
+    const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
+    const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
+    const float eps = 0.01f;
+
+    float dx = 0.0f;
+    for (int j = 0; j < n2; ++j) {
+      float delta = b[j] - a[i];
+      float r2 = delta * delta;
+      float s0 = r2 + eps;
+      float s1 = 1.0f / sqrtf(s0);
+      float f = (s1 * s1 * s1) -
+                (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
+      dx += f * delta;
+    }
+    c[i] = dx * 0.23f;
+  }
+}
+
+void clean_cache_gpu(double *d, int n) {
+
+#pragma omp target teams distribute parallel for thread_limit(1024)
+  for (unsigned i = 0; i < n; ++i)
+    d[i] = i;
+
+  return;
+}
+
+int main() {
+
+  float *a, *b, *c;
+  double *d;
+
+  a = new float[ARRAYLEN1];
+  b = new float[ARRAYLEN2];
+  c = new float[ARRAYLEN1];
+  d = new double[CACHE_CLEAN_SIZE];
+
+  // intialize
+  float dx = 1.0f / (float)ARRAYLEN2;
+  b[0] = 0.0f;
+  for (int i = 1; i < ARRAYLEN2; ++i) {
+    b[i] = b[i - 1] + dx;
+  }
+  for (int i = 0; i < ARRAYLEN1; ++i) {
+    a[i] = b[i];
+    c[i] = 0.0f;
+  }
+
+#pragma omp target
+  {}
+
+#pragma omp target enter data map(alloc                                        \
+                                  : a [0:ARRAYLEN1], b [0:ARRAYLEN2],          \
+                                    c [0:ARRAYLEN1])
+#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE])
+
+#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2])
+
+  double t1, t2, elapsed_s = 0.0;
+  for (int i = 0; i < ITERATIONS; ++i) {
+    clean_cache_gpu(d, CACHE_CLEAN_SIZE);
+
+    t1 = omp_get_wtime();
+    nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
+    t2 = omp_get_wtime();
+
+    elapsed_s += (t2 - t1);
+  }
+
+#pragma omp target update from(c [0:ARRAYLEN1])
+
+  double sum = 0.0f;
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    sum += c[i];
+  printf("Obtained output = %8.3f\n", sum);
+
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    c[i] = 0.0f;
+  nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
+  sum = 0.0f;
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    sum += c[i];
+  printf("Expected output = %8.3f\n", sum);
+
+  printf("\nTotal time = %8.1f milliseconds\n", (elapsed_s * 1000));
+
+#pragma omp target exit data map(delete                                        \
+                                 : a [0:ARRAYLEN1], b [0:ARRAYLEN2],           \
+                                   c [0:ARRAYLEN1])
+#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE])
+
+  delete[] a;
+  delete[] b;
+  delete[] c;
+  delete[] d;
+
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt
new file mode 100644
index 0000000000..6ec8e627ac
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_compile_options(-fopenmp-target-simd)
+add_example(nbody_c_simd)
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp
new file mode 100644
index 0000000000..fc6cffffd8
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c_simd/nbody_c_simd.cpp
@@ -0,0 +1,169 @@
+#include <chrono>
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+
+#define CACHE_CLEAN_SIZE 100000000
+#define ITERATIONS 100
+#define ARRAYLEN1 4096
+#define ARRAYLEN2 32768
+#define VECLEN 16
+// snippet-begin
+#define WORKGROUP_SIZE 1024
+#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3;  2 = prefetch to L3
+#define TILE_SIZE 64
+
+void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) {
+#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE / \
+                                                              VECLEN)
+  for (int i = 0; i < n1; i += VECLEN) {
+    const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
+    const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
+    const float eps = 0.01f;
+
+    float dx[VECLEN];
+    float aa[VECLEN], bb[TILE_SIZE];
+#pragma omp simd simdlen(VECLEN)
+#pragma unroll(0)
+    for (int v = 0; v < VECLEN; ++v) {
+      dx[v] = 0.0f;
+      aa[v] = a[i + v];
+    }
+    for (int j = 0; j < n2; j += TILE_SIZE) {
+      // load tile from b
+      for (int u = 0; u < TILE_SIZE; u += VECLEN) {
+#pragma omp simd simdlen(VECLEN)
+#pragma unroll(0)
+        for (int v = 0; v < VECLEN; ++v)
+          bb[u + v] = b[j + u + v];
+#ifdef PREFETCH
+        int next_tile = j + TILE_SIZE + u;
+#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2)
+#endif
+      }
+// compute current tile
+#pragma omp simd simdlen(VECLEN)
+#pragma unroll(0)
+      for (int v = 0; v < VECLEN; ++v) {
+#pragma unroll(TILE_SIZE)
+        for (int u = 0; u < TILE_SIZE; ++u) {
+          float delta = bb[u] - aa[v];
+          float r2 = delta * delta;
+          float s0 = r2 + eps;
+          float s1 = 1.0f / sqrtf(s0);
+          float f =
+              (s1 * s1 * s1) -
+              (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
+          dx[v] += f * delta;
+        }
+      }
+    }
+#pragma omp simd simdlen(VECLEN)
+#pragma unroll(0)
+    for (int v = 0; v < VECLEN; ++v) {
+      c[i + v] = dx[v] * 0.23f;
+    }
+  }
+}
+// snippet-end
+
+void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) {
+  for (int i = 0; i < n1; ++i) {
+    const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
+    const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
+    const float eps = 0.01f;
+
+    float dx = 0.0f;
+    for (int j = 0; j < n2; ++j) {
+      float delta = b[j] - a[i];
+      float r2 = delta * delta;
+      float s0 = r2 + eps;
+      float s1 = 1.0f / sqrtf(s0);
+      float f = (s1 * s1 * s1) -
+                (ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
+      dx += f * delta;
+    }
+    c[i] = dx * 0.23f;
+  }
+}
+
+void clean_cache_gpu(double *d, int n) {
+
+#pragma omp target teams distribute parallel for thread_limit(1024)
+  for (unsigned i = 0; i < n; ++i)
+    d[i] = i;
+
+  return;
+}
+
+int main() {
+
+  float *a, *b, *c;
+  double *d;
+
+  a = new float[ARRAYLEN1];
+  b = new float[ARRAYLEN2];
+  c = new float[ARRAYLEN1];
+  d = new double[CACHE_CLEAN_SIZE];
+
+  // intialize
+  float dx = 1.0f / (float)ARRAYLEN2;
+  b[0] = 0.0f;
+  for (int i = 1; i < ARRAYLEN2; ++i) {
+    b[i] = b[i - 1] + dx;
+  }
+  for (int i = 0; i < ARRAYLEN1; ++i) {
+    a[i] = b[i];
+    c[i] = 0.0f;
+  }
+
+#pragma omp target
+  {}
+
+#pragma omp target enter data map(alloc                                        \
+                                  : a [0:ARRAYLEN1], b [0:ARRAYLEN2],          \
+                                    c [0:ARRAYLEN1])
+#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE])
+
+#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2])
+
+  double t1, t2, elapsed_s = 0.0;
+  for (int i = 0; i < ITERATIONS; ++i) {
+    clean_cache_gpu(d, CACHE_CLEAN_SIZE);
+
+    t1 = omp_get_wtime();
+    nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
+    t2 = omp_get_wtime();
+
+    elapsed_s += (t2 - t1);
+  }
+
+#pragma omp target update from(c [0:ARRAYLEN1])
+
+  float sum = 0.0f;
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    sum += c[i];
+  printf("Obtained output = %8.3f\n", sum);
+
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    c[i] = 0.0f;
+  nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
+  sum = 0.0f;
+  for (int i = 0; i < ARRAYLEN1; ++i)
+    sum += c[i];
+  printf("Expected output = %8.3f\n", sum);
+
+  printf("\nTotal time = %8.1f milliseconds\n", (elapsed_s * 1000));
+
+#pragma omp target exit data map(delete                                        \
+                                 : a [0:ARRAYLEN1], b [0:ARRAYLEN2],           \
+                                   c [0:ARRAYLEN1])
+#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE])
+
+  delete[] a;
+  delete[] b;
+  delete[] c;
+  delete[] d;
+
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt
new file mode 100644
index 0000000000..e9106a7dda
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_compile_options(-fpconstant -fpp -ffast-math -fno-sycl-instrument-device-code)
+add_fortran_example_with_mkl(nbody_f)
diff --git a/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f
new file mode 100644
index 0000000000..a893aa7d8c
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/fortran/nbody_f.f
@@ -0,0 +1,159 @@
+#define CACHE_CLEAN_SIZE 100000000
+#define ITERATIONS 100
+#define ARRAYLEN1 4096
+#define ARRAYLEN2 32768
+c snippet-begin
+#define WORKGROUP_SIZE 1024
+#define PREFETCH_HINT 4     ! 4 = prefetch to L1 and L3;  2 = prefetch to L3
+#define TILE_SIZE 64
+
+      subroutine nbody_1d_gpu(c, a, b, n1, n2)
+      implicit none
+      integer n1, n2
+      real a(0:n1-1), b(0:n2-1), c(0:n1-1)
+      real dx, bb(0:TILE_SIZE-1), delta, r2, s0, s1, f
+      integer i,j,u,next
+      real ma0, ma1, ma2, ma3, ma4, ma5, eps
+      parameter (ma0=0.269327, ma1=-0.0750978, ma2=0.0114808)
+      parameter (ma3=-0.00109313, ma4=0.0000605491, ma5=-0.00000147177)
+      parameter (eps=0.01)
+
+!$omp target teams distribute parallel do thread_limit(WORKGROUP_SIZE)
+!$omp& private(i,dx,j,u,bb,next,delta,r2,s0,s1,f)
+      do i = 0, n1-1
+        dx = 0.0
+        do j = 0, n2-1, TILE_SIZE
+          ! load tile from b
+          do u = 0, TILE_SIZE-1
+            bb(u) = b(j+u)
+#ifdef PREFETCH
+            next = j + TILE_SIZE + u
+            if (mod(next,16).eq.0) then
+!$omp prefetch data(PREFETCH_HINT:b(next:next))if(next<n2)
+            endif
+#endif
+          enddo
+          ! compute
+          !DIR$ unroll(TILE_SIZE)
+          do u = 0, TILE_SIZE-1
+            delta = bb(u) - a(i)
+            r2 = delta*delta
+            s0 = r2 + eps
+            s1 = 1.0 / sqrt(s0)
+            f = (s1*s1*s1)-(ma0+r2*(ma1+r2*(ma2+r2*(ma3+r2*(ma4+ma5)))))
+            dx = dx + f*delta
+          enddo
+        enddo
+        c(i) = dx*0.23
+      enddo
+      end subroutine
+c snippet-end
+
+      subroutine nbody_1d_cpu(c, a, b, n1, n2)
+      implicit none
+      integer n1, n2
+      real a(0:n1), b(0:n2), c(0:n1)
+      real dx, bb(0:TILE_SIZE), delta, r2, s0, s1, f
+      integer i,j
+      real ma0, ma1, ma2, ma3, ma4, ma5, eps
+      parameter (ma0=0.269327, ma1=-0.0750978, ma2=0.0114808)
+      parameter (ma3=-0.00109313, ma4=0.0000605491, ma5=-0.00000147177)
+      parameter (eps=0.01)
+      do i = 0, n1-1
+        dx = 0.0
+        do j = 0, n2-1
+          ! compute
+          delta = b(j) - a(i)
+          r2 = delta*delta
+          s0 = r2 + eps
+          s1 = 1.0 / sqrt(s0)
+          f = (s1*s1*s1)-(ma0+r2*(ma1+r2*(ma2+r2*(ma3+r2*(ma4+ma5)))))
+          dx = dx + f*delta
+        enddo
+        c(i) = dx*0.23
+      enddo
+      end subroutine
+
+
+      subroutine clean_cache_gpu(d,n)
+      implicit none
+      real d(1)
+      integer n, i
+!$omp target teams distribute parallel do thread_limit(1024)
+      do i=1,n
+         d(i) = i
+      end do
+!$omp end target teams distribute parallel do
+      end subroutine
+
+      program nbody
+      implicit none
+      include 'omp_lib.h'
+
+      real, dimension(:), allocatable :: a,b,c,d
+      double precision sum1
+      real dx,dummy
+      integer i
+      integer N1, N2
+      double precision t1, t2, elapsed_s
+      parameter (N1=ARRAYLEN1, N2=ARRAYLEN2)
+
+      allocate(a(N1),b(N2),c(N1))
+      allocate(d(CACHE_CLEAN_SIZE))
+
+      !initialize
+      dx = 1.0/N2
+      b(1) = 0.0
+      do i = 2, N2
+        b(i) = b(i-1) + dx
+      enddo
+      do i = 1, N1
+        a(i) = b(i)
+        c(i) = 0.0
+      enddo
+
+      call omp_set_default_device(0)
+
+!$omp target
+      dummy = 0.0
+!$omp end target
+
+      elapsed_s = 0.0
+!$omp target enter data map(alloc:a(1:N1),b(1:N2),c(1:N1))
+!$omp target enter data map(alloc:d(1:CACHE_CLEAN_SIZE))
+!$omp target update to(a(1:N1),b(1:N2))
+      do i = 1, ITERATIONS
+         call clean_cache_gpu(d,CACHE_CLEAN_SIZE)
+         t1 = omp_get_wtime()
+         call nbody_1d_gpu(c,a,b,N1,N2)
+         t2 = omp_get_wtime()
+         elapsed_s = elapsed_s + (t2 - t1)
+      enddo
+!$omp target update from(c(1:N1))
+
+      sum1=0.0
+      do i = 1,N1
+        sum1 = sum1 + c(i)
+      enddo
+      print '(/,4x,"Obtained output = ",f15.3)',sum1
+
+      do i = 1,N1
+        c(i) = 0.0
+      enddo
+      call nbody_1d_cpu(c,a,b,N1,N2)
+
+      sum1=0.0
+      do i = 1,N1
+        sum1 = sum1 + c(i)
+      enddo
+      print '(/,4x,"Expected output = ",f15.3)',sum1
+
+      print '(//,4x,"Total time = ",f8.1," milliseconds")',
+     +  elapsed_s*1000
+
+!$omp target exit data map(delete:a(1:N1),b(1:N2),c(1:N1))
+!$omp target exit data map(delete:d(1:CACHE_CLEAN_SIZE))
+
+      deallocate(a,b,c,d)
+
+      end program nbody
diff --git a/Publications/GPU-Opt-Guide/OpenMP/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/CMakeLists.txt
index 321124be0d..99cdb60b80 100644
--- a/Publications/GPU-Opt-Guide/OpenMP/CMakeLists.txt
+++ b/Publications/GPU-Opt-Guide/OpenMP/CMakeLists.txt
@@ -20,3 +20,4 @@ add_subdirectory(24_device_ptr_addr_clauses)
 add_subdirectory(25_NWChem_based_example_1)
 add_subdirectory(25_NWChem_based_example_2)
 add_subdirectory(25_NWChem_based_example_3)
+add_subdirectory(26_omp_prefetch)
diff --git a/Publications/GPU-Opt-Guide/atomics/align.hpp b/Publications/GPU-Opt-Guide/atomics/align.hpp
index 10eac51d5f..9ffbfb7fd4 100644
--- a/Publications/GPU-Opt-Guide/atomics/align.hpp
+++ b/Publications/GPU-Opt-Guide/atomics/align.hpp
@@ -6,6 +6,11 @@
 #ifndef __ALIGN
 #define __ALIGN 1
 
+#include <CL/sycl.hpp>
+#include <array>
+#include <chrono>
+#include <iostream>
+
 enum class Alignment : size_t {
   Normal = sizeof(void *),
   SSE = 16,
diff --git a/Publications/GPU-Opt-Guide/atomics/atomics.cpp b/Publications/GPU-Opt-Guide/atomics/atomics.cpp
index f3a2c6858d..3e33a4eab6 100644
--- a/Publications/GPU-Opt-Guide/atomics/atomics.cpp
+++ b/Publications/GPU-Opt-Guide/atomics/atomics.cpp
@@ -134,10 +134,10 @@ int reductionInt(sycl::queue &q, std::vector<int> &data,
 
       h.parallel_for(data_size, [=](auto index) {
         size_t glob_id = index[0];
-        auto v = sycl::ext::oneapi::atomic_ref<
-            int, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(sum_acc[0]);
+        auto v = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            sum_acc[0]);
         v.fetch_add(buf_acc[glob_id]);
       });
     });
@@ -200,10 +200,10 @@ int reductionFloat(sycl::queue &q, std::vector<float> &data,
 
       h.parallel_for(data_size, [=](auto index) {
         size_t glob_id = index[0];
-        auto v = sycl::ext::oneapi::atomic_ref<
-            float, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(sum_acc[0]);
+        auto v = sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            sum_acc[0]);
         v.fetch_add(buf_acc[glob_id]);
       });
     });
@@ -225,7 +225,7 @@ int reductionFloat(sycl::queue &q, std::vector<float> &data,
 
 int main(int argc, char *argv[]) {
 
-  sycl::queue q{sycl::default_selector{}, exception_handler};
+  sycl::queue q{sycl::default_selector_v, exception_handler};
   std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";
   {
     std::vector<int> data(N, 1);
diff --git a/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp b/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp
index d4490b4ff3..84c135d312 100644
--- a/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp
+++ b/Publications/GPU-Opt-Guide/atomics/global_atomics_ref.cpp
@@ -20,10 +20,9 @@ int main() {
     sycl::accessor acc_a(bufa, h, sycl::read_only);
     h.parallel_for(sycl::nd_range<1>(N, M), [=](auto it) {
       auto i = it.get_global_id();
-      sycl::ext::oneapi::atomic_ref<int,
-                                    sycl::ext::oneapi::memory_order_relaxed,
-                                    sycl::ext::oneapi::memory_scope_device,
-                                    sycl::access::address_space::global_space>
+      sycl::atomic_ref<int, sycl::memory_order_relaxed,
+                       sycl::memory_scope_device,
+                       sycl::access::address_space::global_space>
           atomic_op(acc[0]);
       atomic_op += acc_a[i];
     });
diff --git a/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp b/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp
index bffe098da6..a3b674f887 100644
--- a/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp
+++ b/Publications/GPU-Opt-Guide/atomics/local_atomics_ref.cpp
@@ -19,20 +19,17 @@ int main() {
   auto e1 = q.submit([&](sycl::handler &h) {
     sycl::accessor b(global, h);
     sycl::accessor acc_a(bufa, h, sycl::read_only);
-    auto acc = sycl::accessor<int, 1, sycl::access::mode::read_write,
-                              sycl::access::target::local>(NUM_WG, h);
+    auto acc = sycl::local_accessor<int, 1>(NUM_WG, h);
     h.parallel_for(sycl::nd_range<1>(N, M), [=](auto it) {
       auto i = it.get_global_id(0);
       auto group_id = it.get_group(0);
-      sycl::ext::oneapi::atomic_ref<int,
-                                    sycl::ext::oneapi::memory_order_relaxed,
-                                    sycl::ext::oneapi::memory_scope_device,
-                                    sycl::access::address_space::local_space>
+      sycl::atomic_ref<int, sycl::memory_order_relaxed,
+                       sycl::memory_scope_device,
+                       sycl::access::address_space::local_space>
           atomic_op(acc[group_id]);
-      sycl::ext::oneapi::atomic_ref<int,
-                                    sycl::ext::oneapi::memory_order_relaxed,
-                                    sycl::ext::oneapi::memory_scope_device,
-                                    sycl::access::address_space::global_space>
+      sycl::atomic_ref<int, sycl::memory_order_relaxed,
+                       sycl::memory_scope_device,
+                       sycl::access::address_space::global_space>
           atomic_op_global(b[0]);
       atomic_op += acc_a[i];
       it.barrier(sycl::access::fence_space::local_space);
diff --git a/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp b/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp
index dd380abe53..f1ef096630 100644
--- a/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp
+++ b/Publications/GPU-Opt-Guide/atomics/test_atomic.cpp
@@ -9,8 +9,6 @@
 #include <chrono>
 #include <iostream>
 
-sycl::default_selector d_selector;
-
 template <typename T> using VectorAllocator = AlignedAllocator<T>;
 
 template <typename T> using AlignedVector = std::vector<T, VectorAllocator<T>>;
@@ -47,10 +45,10 @@ int VectorInt(sycl::queue &q, int iter) {
       sycl::accessor b_acc(a_buf, h, sycl::read_only);
 
       h.parallel_for(num_items, [=](auto i) {
-        auto v = sycl::ext::oneapi::atomic_ref<
-            int, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(a_acc[0]);
+        auto v = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            a_acc[0]);
         v += b_acc[i];
       });
     });
@@ -85,10 +83,10 @@ int VectorFloat(sycl::queue &q, int iter) {
       sycl::accessor b_acc(a_buf, h, sycl::read_only);
 
       h.parallel_for(num_items, [=](auto i) {
-        auto v = sycl::ext::oneapi::atomic_ref<
-            float, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(a_acc[0]);
+        auto v = sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            a_acc[0]);
         v += b_acc[i];
       });
     });
@@ -122,10 +120,10 @@ int VectorDouble(sycl::queue &q, int iter) {
       sycl::accessor b_acc(a_buf, h, sycl::read_only);
 
       h.parallel_for(num_items, [=](auto i) {
-        auto v = sycl::ext::oneapi::atomic_ref<
-            double, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(a_acc[0]);
+        auto v = sycl::atomic_ref<double, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            a_acc[0]);
         v += b_acc[i];
       });
     });
@@ -140,7 +138,7 @@ int VectorDouble(sycl::queue &q, int iter) {
 
 int main() {
 
-  sycl::queue q(d_selector);
+  sycl::queue q(sycl::gpu_selector_v);
   VectorAllocator<int> alloc;
   AlignedVector<int> a(array_size, alloc);
   AlignedVector<int> b(array_size, alloc);
diff --git a/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp b/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp
index 4579d11c63..012b13b8fe 100644
--- a/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp
+++ b/Publications/GPU-Opt-Guide/exec-model/vaddsync.cpp
@@ -8,7 +8,7 @@
 #include <chrono>
 #include <iostream>
 
-sycl::default_selector d_selector;
+auto d_selector = sycl::default_selector_v;
 
 // Array type and data size for this example.
 constexpr size_t array_size = 3 * 5 * 7 * (1 << 18);
@@ -41,7 +41,7 @@ int VectorAdd3(sycl::queue &q, const IntArray &a, const IntArray &b,
           size_t loc_id = index.get_local_id();
           size_t start = grp_id * mysize;
           size_t end = start + mysize;
-          for (int i = 0; i < iter; i++)
+          for (int j = 0; j < iter; j++)
             for (size_t i = start + loc_id; i < end; i += wg_size) {
               sum_acc[i] = a_acc[i] + b_acc[i];
             }
@@ -80,7 +80,7 @@ int VectorAdd4(sycl::queue &q, const IntArray &a, const IntArray &b,
           size_t loc_id = index.get_local_id();
           size_t start = grp_id * mysize;
           size_t end = start + mysize;
-          for (int i = 0; i < iter; i++) {
+          for (int j = 0; j < iter; j++) {
             for (size_t i = start + loc_id; i < end; i += wg_size) {
               sum_acc[i] = a_acc[i] + b_acc[i];
             }
diff --git a/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp b/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp
index ef558c3d01..2c8957f0f5 100644
--- a/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp
+++ b/Publications/GPU-Opt-Guide/exec-model/vec-add.cpp
@@ -8,7 +8,7 @@
 #include <chrono>
 #include <iostream>
 
-sycl::default_selector d_selector;
+auto d_selector = sycl::default_selector_v;
 
 // Array type and data size for this example.
 constexpr size_t array_size = 3 * 5 * 7 * (1 << 17);
@@ -70,7 +70,7 @@ int VectorAdd2(sycl::queue &q, const IntArray &a, const IntArray &b,
           size_t loc_id = index.get_local_id();
           size_t start = grp_id * mysize;
           size_t end = start + mysize;
-          for (int i = 0; i < iter; i++)
+          for (int j = 0; j < iter; j++)
             for (size_t i = start + loc_id; i < end; i += wg_size) {
               sum_acc[i] = a_acc[i] + b_acc[i];
             }
diff --git a/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt b/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt
index 9647261757..d776d83c6b 100644
--- a/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt
+++ b/Publications/GPU-Opt-Guide/grf-mode-selection/CMakeLists.txt
@@ -1,3 +1,2 @@
-add_subdirectory(openmp)
 #add_subdirectory(sycl)
 add_subdirectory(perf)
diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp
index d6fa01ee7c..3d6cf6bdbf 100644
--- a/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp
+++ b/Publications/GPU-Opt-Guide/implicit-scaling/03_stream/stream.cpp
@@ -7,6 +7,7 @@
 // Code for STREAM:
 #include <iostream>
 #include <omp.h>
+#include <cstdint>
 
 // compile via:
 // icpx -O2 -fiopenmp -fopenmp-targets=spir64 ./stream.cpp
@@ -14,11 +15,11 @@
 int main()
 {
     constexpr int64_t N = 256 * 1e6;
-    constexpr int64_t bytes = N * sizeof(int);
+    constexpr int64_t bytes = N * sizeof(int64_t);
 
-    int *a = static_cast<int *>(malloc(bytes));
-    int *b = static_cast<int *>(malloc(bytes));
-    int *c = static_cast<int *>(malloc(bytes));
+    int64_t *a = static_cast<int64_t *>(malloc(bytes));
+    int64_t *b = static_cast<int64_t *>(malloc(bytes));
+    int64_t *c = static_cast<int64_t *>(malloc(bytes));
 
     #pragma omp target enter data map(alloc:a[0:N])
     #pragma omp target enter data map(alloc:b[0:N])
@@ -40,8 +41,7 @@ int main()
         if (irep == 10)
             time = omp_get_wtime();
 
-        #pragma omp target teams distribute parallel for \
-            simd simdlen(32) thread_limit(256)
+        #pragma omp target teams distribute parallel for simd
         for (int i = 0; i < N; ++i)
         {
             c[i] = a[i] + b[i];
@@ -61,7 +61,7 @@ int main()
         }
     }
 
-    const int64_t streamed_bytes = 3 * N * sizeof(int);
+    const int64_t streamed_bytes = 3 * N * sizeof(int64_t);
 
     std::cout << "bandwidth = " << (streamed_bytes / time) * 1E-9
         << " GB/s" << std::endl;
diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp
index d68875457c..d4b07d3d1e 100644
--- a/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp
+++ b/Publications/GPU-Opt-Guide/implicit-scaling/04_stream_3D/stream_3D.cpp
@@ -16,14 +16,14 @@ int main(int argc, char **argv)
 {
     const int device_id = omp_get_default_device();
     const int desired_total_size = 32 * 512 * 16384;
-    const std::size_t bytes = desired_total_size * sizeof(int);
+    const std::size_t bytes = desired_total_size * sizeof(int64_t);
 
     std::cout << "memory footprint = " << 3 * bytes * 1E-9 << " GB"
               << std::endl;
 
-    int *a = static_cast<int*>(omp_target_alloc_device(bytes, device_id));
-    int *b = static_cast<int*>(omp_target_alloc_device(bytes, device_id));
-    int *c = static_cast<int*>(omp_target_alloc_device(bytes, device_id));
+    int64_t *a = static_cast<int64_t*>(omp_target_alloc_device(bytes, device_id));
+    int64_t *b = static_cast<int64_t*>(omp_target_alloc_device(bytes, device_id));
+    int64_t *c = static_cast<int64_t*>(omp_target_alloc_device(bytes, device_id));
 
     const int min = 64;
     const int max = 32768;
@@ -56,8 +56,7 @@ int main(int argc, char **argv)
                 {
                     if (irep == warmup) time = omp_get_wtime();
 
-                    #pragma omp target teams distribute parallel for \
-                        simd simdlen(32) thread_limit(64) collapse(3)
+                    #pragma omp target teams distribute parallel for simd collapse(3)
                     for (int iz = 0; iz < lz; ++iz)
                     {
                         for (int iy = 0; iy < ly; ++iy)
@@ -73,7 +72,7 @@ int main(int argc, char **argv)
                 time = omp_get_wtime() - time;
                 time = time / no_max_rep;
 
-                const int64_t streamed_bytes = 3 * total_size * sizeof(int);
+                const int64_t streamed_bytes = 3 * total_size * sizeof(int64_t);
 
                 std::cout << "bandwidth = " << (streamed_bytes / time) * 1E-9
                           << " GB/s" << std::endl;
diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp
new file mode 100644
index 0000000000..834a22397c
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/implicit-scaling/05_stream_cross_stack/stream_cross_stack.cpp
@@ -0,0 +1,95 @@
+//==============================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+// clang-format off
+// Code for cross stack stream
+#include <iostream>
+#include <omp.h>
+
+// compile via:
+// icpx -O2 -fiopenmp -fopenmp-targets=spir64 ./stream_cross_stack.cpp
+// run via:
+// EnableWalkerPartition=1 ZE_AFFINITY_MASK=0 ./a.out
+
+template <int cross_stack_fraction>
+void cross_stack_stream() {
+
+    constexpr int64_t size = 256*1e6;
+    constexpr int64_t bytes = size * sizeof(int64_t);
+
+    int64_t *a = static_cast<int64_t*>(malloc( bytes ));
+    int64_t *b = static_cast<int64_t*>(malloc( bytes ));
+    int64_t *c = static_cast<int64_t*>(malloc( bytes ));
+    #pragma omp target enter data map( alloc:a[0:size] )
+    #pragma omp target enter data map( alloc:b[0:size] )
+    #pragma omp target enter data map( alloc:c[0:size] )
+
+    for ( int i = 0; i < size; ++i ) {
+
+        a[i] = i + 1;
+        b[i] = i - 1;
+        c[i] = 0;
+    }
+
+    #pragma omp target update to( a[0:size] )
+    #pragma omp target update to( b[0:size] )
+    #pragma omp target update to( c[0:size] )
+
+    const int num_max_rep = 100;
+
+    double time;
+
+    for ( int irep = 0; irep < num_max_rep+10; ++irep ) {
+
+        if ( irep == 10 ) time = omp_get_wtime();
+
+        #pragma omp target teams distribute parallel for simd
+        for ( int j = 0; j < size; ++j ) {
+
+            const int cache_line_id = j / 16;
+
+            int i;
+
+            if ( (cache_line_id%cross_stack_fraction) == 0 ) {
+
+                i = (j+size/2)%size;
+            }
+            else {
+
+                i = j;
+            }
+
+            c[i] = a[i] + b[i];
+        }
+    }
+    time = omp_get_wtime() - time;
+    time = time/num_max_rep;
+
+    #pragma omp target update from( c[0:size] )
+
+    for ( int i = 0; i < size; ++i ) {
+
+        if ( c[i] != 2*i ) {
+
+            std::cout << "wrong results!" << std::endl;
+            exit(1);
+        }
+    }
+
+    const int64_t streamed_bytes = 3 * size * sizeof(int64_t);
+
+    std::cout << "cross_stack_percent = " << (1/(double)cross_stack_fraction)*100
+              << "%, bandwidth = " << (streamed_bytes/time) * 1E-9 << " GB/s" << std::endl;
+}
+
+int main() {
+
+    cross_stack_stream< 1>();
+    cross_stack_stream< 2>();
+    cross_stack_stream< 4>();
+    cross_stack_stream< 8>();
+    cross_stack_stream<16>();
+    cross_stack_stream<32>();
+}
diff --git a/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp b/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp
new file mode 100644
index 0000000000..270bb9eb4f
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/implicit-scaling/06_cross_stack/cross_stack_01.cpp
@@ -0,0 +1,13 @@
+//==============================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+// clang-format off
+// Snippet begin
+#pragma omp target teams distribute parallel for simd
+for (int i = N - 1; i <= 0; --i)
+{
+    c[i] = a[i] + b[i];
+}
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp
index 3f8ce96b18..b484930f32 100644
--- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp
+++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_buffers.cpp
@@ -3,6 +3,8 @@
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
+
+// Snippet begin
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <mkl.h>
@@ -108,3 +110,4 @@ int main(int argc, char **argv) {
                "normalized correlation score of "
             << max_corr / N << "." << std::endl;
 }
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp
index 39f2314521..541864e085 100644
--- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp
+++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm.cpp
@@ -3,6 +3,8 @@
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
+
+// Snippet begin
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <mkl.h>
@@ -102,3 +104,4 @@ int main(int argc, char **argv) {
   sycl::free(sig2, sycl_context);
   sycl::free(corr, sycl_context);
 }
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp
index 0d59a276ee..ee72cd2cfd 100644
--- a/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp
+++ b/Publications/GPU-Opt-Guide/libraries-fcorr/fcorr_1d_usm_fixed.cpp
@@ -3,6 +3,8 @@
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
+
+// Snippet begin
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <mkl.h>
@@ -101,3 +103,4 @@ int main(int argc, char **argv) {
   sycl::free(sig2, sycl_context);
   sycl::free(corr, sycl_context);
 }
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp b/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp
index e90048b22c..910eeadd12 100644
--- a/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp
+++ b/Publications/GPU-Opt-Guide/libraries-stdlib/external_rand.cpp
@@ -6,11 +6,12 @@
 // Compile:
 // dpcpp -D{HOST|CPU|GPU} -std=c++17 -fsycl external_rand.cpp -o external_rand
 
+// Snippet begin
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <random>
 
-#define N 5
+constexpr int N = 5;
 
 extern SYCL_EXTERNAL int rand(void);
 
@@ -41,3 +42,4 @@ int main(void) {
   // Cleanup
   sycl::free(test1, Q.get_context());
 }
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp b/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp
index a26c04b911..e1158356d7 100644
--- a/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp
+++ b/Publications/GPU-Opt-Guide/libraries-stdlib/rng_test.cpp
@@ -3,6 +3,8 @@
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
+
+// Snippet begin
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <oneapi/dpl/random>
@@ -71,3 +73,4 @@ int main(int argc, char **argv) {
   sycl::free(test1, Q.get_context());
   sycl::free(test2, Q.get_context());
 }
+// Snippet end
diff --git a/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp b/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp
index 14a9407334..fb8055f656 100644
--- a/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp
+++ b/Publications/GPU-Opt-Guide/local-global-sync/atomics.cpp
@@ -129,9 +129,7 @@ int reductionIntBarrier(sycl::queue &q, std::vector<int> &data, int iter) {
     q.submit([&](auto &h) {
       sycl::accessor buf_acc(buf, h, sycl::read_only);
       sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
-      sycl::accessor<int, 1, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          scratch(work_group_size, h);
+      sycl::local_accessor<int, 1> scratch(work_group_size, h);
       h.parallel_for(sycl::nd_range<1>{work_group_size, work_group_size},
                      [=](sycl::nd_item<1> item) {
                        size_t loc_id = item.get_local_id(0);
@@ -194,10 +192,10 @@ int reductionIntAtomic(sycl::queue &q, std::vector<int> &data, int iter) {
 
       h.parallel_for(data_size, [=](auto index) {
         size_t glob_id = index[0];
-        auto v = sycl::ext::oneapi::atomic_ref<
-            int, sycl::ext::oneapi::memory_order::relaxed,
-            sycl::ext::oneapi::memory_scope::device,
-            sycl::access::address_space::global_space>(sum_acc[0]);
+        auto v = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                  sycl::memory_scope::device,
+                                  sycl::access::address_space::global_space>(
+            sum_acc[0]);
         v.fetch_add(buf_acc[glob_id]);
       });
     });
@@ -219,7 +217,7 @@ int reductionIntAtomic(sycl::queue &q, std::vector<int> &data, int iter) {
 
 int main(int argc, char *argv[]) {
 
-  sycl::queue q{sycl::default_selector{}, exception_handler};
+  sycl::queue q{sycl::default_selector_v, exception_handler};
   std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";
   std::vector<int> data(N, 1);
   reductionIntSerial(q, data, 1000);
diff --git a/Publications/GPU-Opt-Guide/registers/block-load-store.cpp b/Publications/GPU-Opt-Guide/registers/block-load-store.cpp
index c74c642ff1..7e07895b9d 100644
--- a/Publications/GPU-Opt-Guide/registers/block-load-store.cpp
+++ b/Publications/GPU-Opt-Guide/registers/block-load-store.cpp
@@ -22,7 +22,7 @@ int main() {
     h.parallel_for(
         sycl::nd_range(sycl::range{N}, sycl::range{32}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int x;
 
           using global_ptr =
diff --git a/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp b/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp
index 77e15211cf..8004370b36 100644
--- a/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp
+++ b/Publications/GPU-Opt-Guide/registers/histogram256-int-shared-private.cpp
@@ -30,7 +30,7 @@ int main() {
             << "\n";
 
   // Snippet begin
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
   constexpr int NUM_BINS = 256;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
@@ -42,11 +42,11 @@ int main() {
     sycl::accessor macc(mbuf, h, sycl::read_only);
     auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -56,10 +56,10 @@ int main() {
           for (int k = 0; k < NUM_BINS / 16; k++) {
             histogram[k] = 0;
           }
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 // subgroup size is 16
 #pragma unroll
             for (int j = 0; j < 16; j++) {
diff --git a/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp b/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp
index e3532413c2..9ab6f31368 100644
--- a/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp
+++ b/Publications/GPU-Opt-Guide/registers/histogram256-int.cpp
@@ -30,7 +30,7 @@ int main() {
             << "\n";
 
   // Snippet begin
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
   constexpr int NUM_BINS = 256;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
@@ -42,11 +42,11 @@ int main() {
     sycl::accessor macc(mbuf, h, sycl::read_only);
     auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -56,10 +56,10 @@ int main() {
           for (int k = 0; k < NUM_BINS; k++) {
             histogram[k] = 0;
           }
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
             for (int i = 0; i < 8; i++) {
               unsigned int c = x & 0x1FU;
diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp
index 162ac59d59..486982e455 100644
--- a/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp
+++ b/Publications/GPU-Opt-Guide/registers/histogram32-int-volatile.cpp
@@ -30,7 +30,7 @@ int main() {
             << "\n";
 
   // Snippet begin
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
   constexpr int NUM_BINS = 32;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
@@ -41,11 +41,11 @@ int main() {
   auto e = q.submit([&](auto &h) {
     sycl::accessor macc(mbuf, h, sycl::read_only);
     auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
-    h.parallel_for(sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+    h.parallel_for(sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
                    [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
                      int group = it.get_group()[0];
                      int gSize = it.get_local_range()[0];
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      int sgGroup = sg.get_group_id()[0];
 
@@ -56,10 +56,10 @@ int main() {
                      for (int k = 0; k < NUM_BINS; k++) {
                        histogram[k] = 0;
                      }
-                     for (int k = 0; k < blockSize; k++) {
+                     for (int k = 0; k < BLOCK_SIZE; k++) {
                        unsigned long x = sg.load(
-                           macc.get_pointer() + group * gSize * blockSize +
-                           sgGroup * sgSize * blockSize + sgSize * k);
+                           macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                           sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
                        for (int i = 0; i < 8; i++) {
                          unsigned int c = x & 0x1FU;
diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp
index 0c72b211c8..8312d0e95c 100644
--- a/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp
+++ b/Publications/GPU-Opt-Guide/registers/histogram32-int.cpp
@@ -30,7 +30,7 @@ int main() {
             << "\n";
 
   // Snippet begin
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
   constexpr int NUM_BINS = 32;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
@@ -42,11 +42,11 @@ int main() {
     sycl::accessor macc(mbuf, h, sycl::read_only);
     auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -55,10 +55,10 @@ int main() {
           for (int k = 0; k < NUM_BINS; k++) {
             histogram[k] = 0;
           }
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
             for (int i = 0; i < 8; i++) {
               unsigned int c = x & 0x1FU;
diff --git a/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp b/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp
index ebf5eb1ad8..842403fe06 100644
--- a/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp
+++ b/Publications/GPU-Opt-Guide/registers/histogram32-long.cpp
@@ -30,7 +30,7 @@ int main() {
             << "\n";
 
   // Snippet begin
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
   constexpr int NUM_BINS = 32;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
@@ -41,11 +41,11 @@ int main() {
     sycl::accessor macc(mbuf, h, sycl::read_only);
     auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -55,10 +55,10 @@ int main() {
           for (int k = 0; k < NUM_BINS; k++) {
             histogram[k] = 0;
           }
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
             for (int i = 0; i < 8; i++) {
               unsigned int c = x & 0x1FU;
diff --git a/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp b/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp
index 01cbc4e5f7..556e2dfbd7 100644
--- a/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp
+++ b/Publications/GPU-Opt-Guide/slm/histogram-slm-1024.cpp
@@ -27,7 +27,7 @@ int main() {
 
   // Snippet begin
   constexpr int NUM_BINS = 1024;
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
   sycl::buffer<unsigned long, 1> mbuf(input.data(), N);
@@ -39,11 +39,11 @@ int main() {
     sycl::local_accessor<unsigned int, 1> local_histogram(sycl::range(NUM_BINS),
                                                           h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -66,10 +66,10 @@ int main() {
           }
           it.barrier(sycl::access::fence_space::local_space);
 
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
             for (std::uint8_t shift : {0, 16, 32, 48}) {
               constexpr unsigned long mask = 0x3FFU;
diff --git a/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp b/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp
index 9a0160cbe0..03e963308f 100644
--- a/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp
+++ b/Publications/GPU-Opt-Guide/slm/histogram-slm-256.cpp
@@ -32,7 +32,7 @@ int main() {
 
   // Snippet begin
   constexpr int NUM_BINS = 256;
-  constexpr int blockSize = 256;
+  constexpr int BLOCK_SIZE = 256;
 
   std::vector<unsigned long> hist(NUM_BINS, 0);
   sycl::buffer<unsigned long, 1> mbuf(input.data(), N);
@@ -44,11 +44,11 @@ int main() {
     sycl::local_accessor<unsigned int> local_histogram(sycl::range(NUM_BINS),
                                                        h);
     h.parallel_for(
-        sycl::nd_range(sycl::range{N / blockSize}, sycl::range{64}),
+        sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
         [=](sycl::nd_item<1> it) {
           int group = it.get_group()[0];
           int gSize = it.get_local_range()[0];
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           int sgSize = sg.get_local_range()[0];
           int sgGroup = sg.get_group_id()[0];
 
@@ -71,10 +71,10 @@ int main() {
           }
           it.barrier(sycl::access::fence_space::local_space);
 
-          for (int k = 0; k < blockSize; k++) {
+          for (int k = 0; k < BLOCK_SIZE; k++) {
             unsigned long x =
-                sg.load(macc.get_pointer() + group * gSize * blockSize +
-                        sgGroup * sgSize * blockSize + sgSize * k);
+                sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
+                        sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
 #pragma unroll
             for (std::uint8_t shift : {0, 8, 16, 24, 32, 40, 48, 56}) {
               constexpr unsigned long mask = 0xFFU;
diff --git a/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt b/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt
index 62d351a18a..4c774ec665 100644
--- a/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt
+++ b/Publications/GPU-Opt-Guide/sub-group/CMakeLists.txt
@@ -7,5 +7,6 @@ add_example(sub-group-4)
 add_example(sub-group-5)
 add_example(sub-group-6)
 add_example(sub-group-7)
+add_example(sub-group-8)
 add_example(sub-group-sizes)
 add_example(transpose)
diff --git a/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt b/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt
index 13e736b7e0..ab0f3a5235 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt
+++ b/Publications/GPU-Opt-Guide/sub-group/sg-max-size-output.txt
@@ -1,7 +1,7 @@
-globalId = 0 sgMaxSize = 8 sgSize = 7 sId = 0 j = 0 k = 7
-globalId = 1 sgMaxSize = 8 sgSize = 7 sId = 1 j = 1 k = 8
-globalId = 2 sgMaxSize = 8 sgSize = 7 sId = 2 j = 2 k = 9
-globalId = 3 sgMaxSize = 8 sgSize = 7 sId = 3 j = 3 k = 10
-globalId = 4 sgMaxSize = 8 sgSize = 7 sId = 4 j = 4 k = 11
-globalId = 5 sgMaxSize = 8 sgSize = 7 sId = 5 j = 5 k = 12
-globalId = 6 sgMaxSize = 8 sgSize = 7 sId = 6 j = 6 k = 13
+globalId = 0 sgMaxSize = 16 sgSize = 7 sId = 0 j = 0 k = 7
+globalId = 1 sgMaxSize = 16 sgSize = 7 sId = 1 j = 1 k = 8
+globalId = 2 sgMaxSize = 16 sgSize = 7 sId = 2 j = 2 k = 9
+globalId = 3 sgMaxSize = 16 sgSize = 7 sId = 3 j = 3 k = 10
+globalId = 4 sgMaxSize = 16 sgSize = 7 sId = 4 j = 4 k = 11
+globalId = 5 sgMaxSize = 16 sgSize = 7 sId = 5 j = 5 k = 12
+globalId = 6 sgMaxSize = 16 sgSize = 7 sId = 6 j = 6 k = 13
diff --git a/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp b/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp
index 0e31390e97..a277003ff3 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sg-max-size.cpp
@@ -72,9 +72,9 @@ int main() {
   auto e = q.submit([&](auto &h) {
     sycl::stream out(65536, 128, h);
     h.parallel_for(sycl::nd_range<1>(7, 7),
-                   [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(8)]] {
+                   [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
                      int i = it.get_global_linear_id();
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      int sgMaxSize = sg.get_max_local_range()[0];
                      int sId = sg.get_local_id()[0];
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp
index 14583e18f7..fafe8070d4 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-0.cpp
@@ -17,7 +17,7 @@ int main() {
                    [=](sycl::nd_item<1> it) {
                      int groupId = it.get_group(0);
                      int globalId = it.get_global_linear_id();
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      int sgGroupId = sg.get_group_id()[0];
                      int sgId = sg.get_local_id()[0];
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp
index 4aefce6bbf..0c9eea3c08 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-1.cpp
@@ -17,7 +17,7 @@ int main() {
                    [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(32)]] {
                      int groupId = it.get_group(0);
                      int globalId = it.get_global_linear_id();
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      int sgGroupId = sg.get_group_id()[0];
                      int sgId = sg.get_local_id()[0];
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp
index 9bad3ffdf8..2d15ca038d 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-3.cpp
@@ -20,7 +20,7 @@ int main() {
     h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}),
                    [=](sycl::nd_item<1> it) {
                      int i = it.get_global_linear_id();
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      i = (i / sgSize) * sgSize * 16 + (i % sgSize);
                      for (int j = 0; j < sgSize * 16; j += sgSize) {
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp
index 7d1178b43d..4120ace227 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-5.cpp
@@ -23,7 +23,7 @@ int main() {
     h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}),
                    [=](sycl::nd_item<1> it) {
                      int i = it.get_global_linear_id();
-                     sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+                     auto sg = it.get_sub_group();
                      int sgSize = sg.get_local_range()[0];
                      i = (i / sgSize) * sgSize * 16 + (i % sgSize);
                      for (int j = 0; j < sgSize * 16; j += sgSize) {
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp
index 6862e13572..1d7dc7c86a 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-6.cpp
@@ -22,7 +22,7 @@ int main() {
     h.parallel_for(
         sycl::nd_range(sycl::range{N / 16}, sycl::range{32}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           sycl::vec<int, 8> x;
 
           using global_ptr =
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp
index 83da19af88..9575622ff8 100644
--- a/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-7.cpp
@@ -22,7 +22,7 @@ int main() {
     h.parallel_for(
         sycl::nd_range(sycl::range{N / 16}, sycl::range{32}),
         [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
-          sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+          auto sg = it.get_sub_group();
           sycl::vec<int, 4> x;
 
           int base = (it.get_group(0) * 32 +
diff --git a/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp b/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp
new file mode 100644
index 0000000000..b7f0d6c993
--- /dev/null
+++ b/Publications/GPU-Opt-Guide/sub-group/sub-group-8.cpp
@@ -0,0 +1,49 @@
+//==============================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <iostream>
+
+int main() {
+  sycl::queue q{sycl::gpu_selector_v,
+                sycl::property::queue::enable_profiling{}};
+
+  std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>()
+            << std::endl;
+
+  // Snippet begin
+  constexpr int N = 1024 * 1024;
+  int *data = sycl::malloc_shared<int>(N, q);
+  int *data2 = sycl::malloc_shared<int>(N, q);
+  memset(data2, 0xFF, sizeof(int) * N);
+
+  auto e = q.submit([&](auto &h) {
+    h.parallel_for(sycl::nd_range(sycl::range{N / 16}, sycl::range{32}),
+                   [=](sycl::nd_item<1> it) {
+                     int i = it.get_global_linear_id();
+                     auto sg = it.get_sub_group();
+                     int sgSize = sg.get_local_range()[0];
+                     i = (i / sgSize) * sgSize * 16 + (i % sgSize) * 4;
+                     for (int j = 0; j < 4; j++) {
+                       sycl::vec<int, 4> x;
+                       sycl::vec<int, 4> *q =
+                           (sycl::vec<int, 4> *)(&(data2[i + j * sgSize * 4]));
+                       x = *q;
+                       sycl::vec<int, 4> *r =
+                           (sycl::vec<int, 4> *)(&(data[i + j * sgSize * 4]));
+                       *r = x;
+                     }
+                   });
+  });
+  // Snippet end
+  q.wait();
+  std::cout << "Kernel time = "
+            << (e.template get_profiling_info<
+                    sycl::info::event_profiling::command_end>() -
+                e.template get_profiling_info<
+                    sycl::info::event_profiling::command_start>())
+            << " ns" << std::endl;
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/sub-group/transpose.cpp b/Publications/GPU-Opt-Guide/sub-group/transpose.cpp
index 11d719534b..7724d63a8c 100644
--- a/Publications/GPU-Opt-Guide/sub-group/transpose.cpp
+++ b/Publications/GPU-Opt-Guide/sub-group/transpose.cpp
@@ -34,44 +34,44 @@ int main() {
   {
 
     // Snippet begin
-    constexpr size_t blockSize = 16;
+    constexpr size_t BLOCK_SIZE = 16;
     sycl::buffer<uint, 2> m(matrix.data(), sycl::range<2>(N, N));
 
     auto e = q.submit([&](auto &h) {
       sycl::accessor marr(m, h);
-      sycl::local_accessor<uint, 2> barr1(sycl::range<2>(blockSize, blockSize),
-                                          h);
-      sycl::local_accessor<uint, 2> barr2(sycl::range<2>(blockSize, blockSize),
-                                          h);
+      sycl::local_accessor<uint, 2> barr1(
+          sycl::range<2>(BLOCK_SIZE, BLOCK_SIZE), h);
+      sycl::local_accessor<uint, 2> barr2(
+          sycl::range<2>(BLOCK_SIZE, BLOCK_SIZE), h);
 
       h.parallel_for(
-          sycl::nd_range<2>(sycl::range<2>(N / blockSize, N),
-                            sycl::range<2>(1, blockSize)),
+          sycl::nd_range<2>(sycl::range<2>(N / BLOCK_SIZE, N),
+                            sycl::range<2>(1, BLOCK_SIZE)),
           [=](sycl::nd_item<2> it) [[intel::reqd_sub_group_size(16)]] {
             int gi = it.get_group(0);
             int gj = it.get_group(1);
 
-            sycl::ext::oneapi::sub_group sg = it.get_sub_group();
+            auto sg = it.get_sub_group();
             uint sgId = sg.get_local_id()[0];
 
-            uint bcol[blockSize];
-            int ai = blockSize * gi;
-            int aj = blockSize * gj;
+            uint bcol[BLOCK_SIZE];
+            int ai = BLOCK_SIZE * gi;
+            int aj = BLOCK_SIZE * gj;
 
-            for (uint k = 0; k < blockSize; k++) {
+            for (uint k = 0; k < BLOCK_SIZE; k++) {
               bcol[k] = sg.load(marr.get_pointer() + (ai + k) * N + aj);
             }
 
-            uint tcol[blockSize];
-            for (uint n = 0; n < blockSize; n++) {
+            uint tcol[BLOCK_SIZE];
+            for (uint n = 0; n < BLOCK_SIZE; n++) {
               if (sgId == n) {
-                for (uint k = 0; k < blockSize; k++) {
+                for (uint k = 0; k < BLOCK_SIZE; k++) {
                   tcol[k] = sg.shuffle(bcol[n], k);
                 }
               }
             }
 
-            for (uint k = 0; k < blockSize; k++) {
+            for (uint k = 0; k < BLOCK_SIZE; k++) {
               sg.store(marr.get_pointer() + (ai + k) * N + aj, tcol[k]);
             }
           });