oneapi-src · jimmytwei · May 28, 2025 · May 27, 2025
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/CMakeLists.txt b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_example(test_target_no_nowait)
-add_example(test_target_nowait)
-add_fortran_example(test_target_nowait_f)
+add_example(cpu_only)
+add_example(gpu_only)
+add_example(hybrid_blocking)
+add_example(hybrid_non_blocking)
+add_fortran_example(hybrid_non_blocking_f)
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/cpu_only.cpp b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/cpu_only.cpp
@@ -0,0 +1,61 @@
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N_A 1000000
+#define N_B 500000
+#define NUM_ITERATIONS 10000
+
+// CPU-only baseline: each outer iteration runs the math-heavy loop over `a`
+// on the host with OpenMP threads, then the serial scan over `b`. The elapsed
+// wall-clock time for all iterations is printed.
+int main() {
+  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
+  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
+  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));
+  // Bail out cleanly instead of dereferencing a null pointer below.
+  if (!a || !b || !res) {
+    fprintf(stderr, "Memory allocation failed.\n");
+    free(a);
+    free(b);
+    free(res);
+    return 1;
+  }
+
+  // Initialize a and b
+  for (int i = 0; i < N_A; ++i)
+    a[i] = i * 0.0001f;
+  for (int i = 0; i < N_B; ++i)
+    b[i] = (i % 1000) * 0.001f;
+
+  float sum = 0;
+  int count = 0;
+
+  const double start = omp_get_wtime();
+  for (int j = 0; j < NUM_ITERATIONS; ++j) {
+// Snippet1 begin
+#pragma omp parallel for
+    for (int i = 0; i < N_A; ++i) {
+      float acc = a[i];
+      for (int k = 0; k < 20; ++k)
+        acc = sinf(acc) * expf(acc) + acc * 1.01f;
+      res[i] = acc;
+    }
+    // Snippet1 end
+    // Snippet2 begin
+    for (int i = 0; i < N_B; ++i) {
+      float val = b[i];
+      if (val > 0.5f)
+        count++;
+      sum += val * 0.1f;
+    }
+    // Snippet2 end
+  }
+  const double end = omp_get_wtime();
+  printf("CPU only time: %f seconds.\n", end - start);
+  free(a);
+  free(b);
+  free(res);
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/gpu_only.cpp b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/gpu_only.cpp
@@ -0,0 +1,68 @@
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N_A 1000000
+#define N_B 500000
+#define NUM_ITERATIONS 10000
+
+// GPU-only variant: both loops of every outer iteration are offloaded with
+// blocking `target` constructs, and the elapsed wall-clock time is printed.
+int main() {
+  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
+  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
+  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));
+  // Bail out cleanly instead of dereferencing a null pointer below.
+  if (!a || !b || !res) {
+    fprintf(stderr, "Memory allocation failed.\n");
+    free(a);
+    free(b);
+    free(res);
+    return 1;
+  }
+
+  // Initialize a and b
+  for (int i = 0; i < N_A; ++i)
+    a[i] = i * 0.0001f;
+  for (int i = 0; i < N_B; ++i)
+    b[i] = (i % 1000) * 0.001f;
+
+  float sum = 0;
+  int count = 0;
+
+// Dummy target region to warm up GPU
+#pragma omp target
+  {
+    ;
+  }
+
+  const double start = omp_get_wtime();
+  for (int j = 0; j < NUM_ITERATIONS; ++j) {
+#pragma omp target teams distribute parallel for map(to : a[0 : N_A])          \
+    map(from : res[0 : N_A])
+    for (int i = 0; i < N_A; ++i) {
+      float acc = a[i];
+      for (int k = 0; k < 20; ++k)
+        acc = sinf(acc) * expf(acc) + acc * 1.01f;
+      res[i] = acc;
+    }
+    // sum and count are mapped tofrom explicitly so the reduction results
+    // reach the host regardless of the implicit data-mapping rules of the
+    // OpenMP version the compiler implements.
+#pragma omp target teams distribute parallel for map(to : b[0 : N_B])          \
+    map(tofrom : sum, count) reduction(+ : sum, count)
+    for (int i = 0; i < N_B; ++i) {
+      float val = b[i];
+      if (val > 0.5f)
+        count++;
+      sum += val * 0.1f;
+    }
+  }
+  const double end = omp_get_wtime();
+  printf("GPU only time: %f seconds.\n", end - start);
+  free(a);
+  free(b);
+  free(res);
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_blocking.cpp b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_blocking.cpp
@@ -0,0 +1,65 @@
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N_A 1000000
+#define N_B 500000
+#define NUM_ITERATIONS 10000
+
+// Hybrid blocking variant: the loop over `a` is offloaded with a blocking
+// `target` construct, so the host loop over `b` starts only after the
+// offloaded loop has completed. The elapsed wall-clock time is printed.
+int main() {
+  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
+  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
+  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));
+  // Bail out cleanly instead of dereferencing a null pointer below.
+  if (!a || !b || !res) {
+    fprintf(stderr, "Memory allocation failed.\n");
+    free(a);
+    free(b);
+    free(res);
+    return 1;
+  }
+
+  // Initialize a and b
+  for (int i = 0; i < N_A; ++i)
+    a[i] = i * 0.0001f;
+  for (int i = 0; i < N_B; ++i)
+    b[i] = (i % 1000) * 0.001f;
+
+  float sum = 0;
+  int count = 0;
+
+// Dummy target region to warm up GPU
+#pragma omp target
+  {
+    ;
+  }
+
+  const double start = omp_get_wtime();
+  for (int j = 0; j < NUM_ITERATIONS; ++j) {
+#pragma omp target teams distribute parallel for map(to : a[0 : N_A])          \
+    map(from : res[0 : N_A])
+    for (int i = 0; i < N_A; ++i) {
+      float acc = a[i];
+      for (int k = 0; k < 20; ++k)
+        acc = sinf(acc) * expf(acc) + acc * 1.01f;
+      res[i] = acc;
+    }
+    // Host-side loop; runs after the target region above has finished.
+    for (int i = 0; i < N_B; ++i) {
+      float val = b[i];
+      if (val > 0.5f)
+        count++;
+      sum += val * 0.1f;
+    }
+  }
+  const double end = omp_get_wtime();
+  printf("Hybrid Blocking: %f seconds\n", end - start);
+  free(a);
+  free(b);
+  free(res);
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_non_blocking.cpp b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_non_blocking.cpp
@@ -0,0 +1,67 @@
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N_A 1000000
+#define N_B 500000
+#define NUM_ITERATIONS 10000
+
+// Hybrid non-blocking variant: the loop over `a` is offloaded with `nowait`,
+// so the host loop over `b` can run while the target region executes; a
+// `taskwait` ends each iteration. The elapsed wall-clock time is printed.
+int main() {
+  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
+  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
+  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));
+  // Bail out cleanly instead of dereferencing a null pointer below.
+  if (!a || !b || !res) {
+    fprintf(stderr, "Memory allocation failed.\n");
+    free(a);
+    free(b);
+    free(res);
+    return 1;
+  }
+
+  // Initialize a and b
+  for (int i = 0; i < N_A; ++i)
+    a[i] = i * 0.0001f;
+  for (int i = 0; i < N_B; ++i)
+    b[i] = (i % 1000) * 0.001f;
+
+  float sum = 0;
+  int count = 0;
+
+// Dummy target region to warm up GPU
+#pragma omp target
+  {
+    ;
+  }
+
+  const double start = omp_get_wtime();
+  for (int j = 0; j < NUM_ITERATIONS; ++j) {
+#pragma omp target teams distribute parallel for nowait map(to : a[0 : N_A])   \
+    map(from : res[0 : N_A])
+    for (int i = 0; i < N_A; ++i) {
+      float acc = a[i];
+      for (int k = 0; k < 20; ++k)
+        acc = sinf(acc) * expf(acc) + acc * 1.01f;
+      res[i] = acc;
+    }
+    // Host-side loop; touches only b, sum and count, none of which the
+    // target region above reads or writes.
+    for (int i = 0; i < N_B; ++i) {
+      float val = b[i];
+      if (val > 0.5f)
+        count++;
+      sum += val * 0.1f;
+    }
+#pragma omp taskwait
+  }
+  const double end = omp_get_wtime();
+  printf("Hybrid Non-blocking: %f seconds\n", end - start);
+  free(a);
+  free(b);
+  free(res);
+  return 0;
+}
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_non_blocking_f.f90 b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/hybrid_non_blocking_f.f90
@@ -0,0 +1,68 @@
+!> Hybrid CPU/GPU example: an asynchronous (nowait) offloaded loop over a is
+!> overlapped with a host loop over b, and the elapsed wall-clock time of all
+!> iterations is printed.
+program hybrid_non_blocking
+  use omp_lib
+  use, intrinsic :: iso_fortran_env, only: sp => real32, dp => real64
+  implicit none
+
+  integer, parameter :: N_A = 1000000, N_B = 500000, NUM_ITERATIONS = 10000
+  real(sp), allocatable :: a(:), b(:), res(:)
+  real(sp) :: b_sum            ! running sum of 0.1 * b(i)
+  integer :: b_count, i, j     ! b_count: number of b(i) > 0.5 seen
+  real(dp) :: t_start, t_end   ! wall-clock timestamps from omp_get_wtime
+
+  allocate(a(N_A), b(N_B), res(N_A))
+  b_sum = 0.0_sp
+  b_count = 0
+
+  do i = 1, N_A
+    a(i) = real(i, sp) * 0.0001_sp
+  end do
+
+  do i = 1, N_B
+    b(i) = real(mod(i, 1000), sp) * 0.001_sp
+  end do
+
+  ! Select the device before the warm-up so the region below initializes the
+  ! same device that the timed loop offloads to.
+  call omp_set_default_device(0)
+
+  ! Dummy target region to warm up GPU
+  !$omp target
+  !$omp end target
+
+  ! Wall-clock time, matching the omp_get_wtime() timing of the C++ examples;
+  ! cpu_time reports processor time, which is not the elapsed time when work
+  ! overlaps across the host and the device.
+  t_start = omp_get_wtime()
+
+  do j = 1, NUM_ITERATIONS
+    ! nowait lets the host continue to the loop over b without waiting for
+    ! the offloaded loop over a/res to finish.
+    ! NOTE(review): the C++ hybrid_non_blocking example repeats its update 20
+    ! times per element; this kernel applies one different update - confirm.
+    !$omp target teams distribute parallel do nowait map(to: a(1:N_A)) map(from: res(1:N_A))
+    do i = 1, N_A
+      res(i) = a(i)
+      res(i) = res(i) + sin(res(i)) * exp(res(i))
+      res(i) = res(i) * 1.01_sp
+    end do
+
+    do i = 1, N_B
+      if (b(i) > 0.5_sp) then
+        b_count = b_count + 1
+      end if
+      b_sum = b_sum + b(i) * 0.1_sp
+    end do
+
+    ! Wait for the deferred target task before starting the next iteration.
+    !$omp taskwait
+  end do
+
+  t_end = omp_get_wtime()
+
+  print *, "Hybrid Non-blocking: ", t_end - t_start, " seconds"
+  deallocate(a, b, res)
+
+end program hybrid_non_blocking
diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/sample.json b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/sample.json
@@ -7,7 +7,7 @@
   "description": "oneAPI GPU Optimization Guide Examples",
   "toolchain": [
     "dpcpp",
-    "ifx"
+    "ifort"
   ],
   "languages": [
     {

diff --git a/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/test_target_no_nowait.cpp b/Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/test_target_no_nowait.cpp