Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# One call per example in this directory; the argument is the example's name.
# NOTE(review): add_example / add_fortran_example are defined outside this
# file -- presumably they build the C++ and Fortran sources of the same name;
# confirm against the parent CMake helpers.
add_example(test_target_no_nowait)
add_example(test_target_nowait)
add_fortran_example(test_target_nowait_f)
add_example(cpu_only)
add_example(gpu_only)
add_example(hybrid_blocking)
add_example(hybrid_non_blocking)
add_fortran_example(hybrid_non_blocking_f)
51 changes: 51 additions & 0 deletions Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/cpu_only.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N_A 1000000
#define N_B 500000
#define NUM_ITERATIONS 10000

// CPU-only baseline: for NUM_ITERATIONS rounds, run a host-parallel loop over
// `a` (writing `res`) followed by a serial loop over `b`, then print the total
// elapsed wall-clock time.
// Returns 0 on success, 1 if any buffer could not be allocated.
int main() {
  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));

  // Bail out cleanly instead of dereferencing a null pointer below.
  if (a == nullptr || b == nullptr || res == nullptr) {
    fprintf(stderr, "Memory allocation failed.\n");
    free(a);
    free(b);
    free(res);
    return 1;
  }

  // Initialize a and b
  for (int i = 0; i < N_A; ++i)
    a[i] = i * 0.0001f;
  for (int i = 0; i < N_B; ++i)
    b[i] = (i % 1000) * 0.001f;

  float sum = 0;
  int count = 0;
  double start, end;

  start = omp_get_wtime();
  for (int j = 0; j < NUM_ITERATIONS; ++j) {
    // Snippet1 begin
#pragma omp parallel for
    for (int i = 0; i < N_A; ++i) {
      float acc = a[i];
      for (int k = 0; k < 20; ++k)
        acc = sinf(acc) * expf(acc) + acc * 1.01f;
      res[i] = acc;
    }
    // Snippet1 end
    // Snippet2 begin
    for (int i = 0; i < N_B; ++i) {
      float val = b[i];
      if (val > 0.5f)
        count++;
      sum += val * 0.1f;
    }
    // Snippet2 end (kept inside the outer loop so the marked snippet is a
    // balanced piece of code)
  }
  end = omp_get_wtime();
  printf("CPU only time: %f seconds.\n", end - start);
  free(a);
  free(b);
  free(res);
  return 0;
}
56 changes: 56 additions & 0 deletions Publications/GPU-Opt-Guide/OpenMP/04_target_nowait/gpu_only.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N_A 1000000
#define N_B 500000
#define NUM_ITERATIONS 10000

// GPU-only variant: for NUM_ITERATIONS rounds, offload both the loop over `a`
// (writing `res`) and the reduction loop over `b` to the device, then print the
// total elapsed wall-clock time.
// Returns 0 on success, 1 if any buffer could not be allocated.
int main() {
  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));

  // Bail out cleanly instead of dereferencing a null pointer below.
  if (a == nullptr || b == nullptr || res == nullptr) {
    fprintf(stderr, "Memory allocation failed.\n");
    free(a);
    free(b);
    free(res);
    return 1;
  }

  // Initialize a and b
  for (int i = 0; i < N_A; ++i)
    a[i] = i * 0.0001f;
  for (int i = 0; i < N_B; ++i)
    b[i] = (i % 1000) * 0.001f;

  float sum = 0;
  int count = 0;
  double start, end;

  // Dummy target region to warm up GPU
#pragma omp target
  {
    ;
  }

  start = omp_get_wtime();
  for (int j = 0; j < NUM_ITERATIONS; ++j) {
#pragma omp target teams distribute parallel for map(to : a[0 : N_A]) \
    map(from : res[0 : N_A])
    for (int i = 0; i < N_A; ++i) {
      float acc = a[i];
      for (int k = 0; k < 20; ++k)
        acc = sinf(acc) * expf(acc) + acc * 1.01f;
      res[i] = acc;
    }
    // The reduction scalars are mapped tofrom explicitly so the reduced values
    // are copied back to the host regardless of which OpenMP version's
    // implicit data-mapping rules the compiler applies to scalars on `target`.
#pragma omp target teams distribute parallel for map(to : b[0 : N_B]) \
    map(tofrom : sum, count) reduction(+ : sum, count)
    for (int i = 0; i < N_B; ++i) {
      float val = b[i];
      if (val > 0.5f)
        count++;
      sum += val * 0.1f;
    }
  }
  end = omp_get_wtime();
  printf("GPU only time: %f seconds.\n", end - start);
  free(a);
  free(b);
  free(res);
  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N_A 1000000
#define N_B 500000
#define NUM_ITERATIONS 10000

// Hybrid blocking variant: for NUM_ITERATIONS rounds, offload the loop over `a`
// (writing `res`) to the device, wait for it to finish, then run the loop over
// `b` on the host. Prints the total elapsed wall-clock time.
// Returns 0 on success, 1 if any buffer could not be allocated.
int main() {
  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));

  // Bail out cleanly instead of dereferencing a null pointer below.
  if (a == nullptr || b == nullptr || res == nullptr) {
    fprintf(stderr, "Memory allocation failed.\n");
    free(a);
    free(b);
    free(res);
    return 1;
  }

  // Initialize a and b
  for (int i = 0; i < N_A; ++i)
    a[i] = i * 0.0001f;
  for (int i = 0; i < N_B; ++i)
    b[i] = (i % 1000) * 0.001f;

  float sum = 0;
  int count = 0;
  double start, end;

  // Dummy target region to warm up GPU
#pragma omp target
  {
    ;
  }

  start = omp_get_wtime();
  for (int j = 0; j < NUM_ITERATIONS; ++j) {
    // No `nowait` here: the host blocks until the offloaded loop completes
    // before starting the loop over b.
#pragma omp target teams distribute parallel for map(to : a[0 : N_A]) \
    map(from : res[0 : N_A])
    for (int i = 0; i < N_A; ++i) {
      float acc = a[i];
      for (int k = 0; k < 20; ++k)
        acc = sinf(acc) * expf(acc) + acc * 1.01f;
      res[i] = acc;
    }
    for (int i = 0; i < N_B; ++i) {
      float val = b[i];
      if (val > 0.5f)
        count++;
      sum += val * 0.1f;
    }
  }
  end = omp_get_wtime();
  printf("Hybrid Blocking: %f seconds\n", end - start);
  free(a);
  free(b);
  free(res);
  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N_A 1000000
#define N_B 500000
#define NUM_ITERATIONS 10000

// Hybrid non-blocking variant: for NUM_ITERATIONS rounds, launch the loop over
// `a` (writing `res`) on the device with `nowait`, run the loop over `b` on the
// host in the meantime, then synchronize with `taskwait`. Prints the total
// elapsed wall-clock time.
// Returns 0 on success, 1 if any buffer could not be allocated.
int main() {
  float *a = static_cast<float *>(malloc(N_A * sizeof(float)));
  float *b = static_cast<float *>(malloc(N_B * sizeof(float)));
  float *res = static_cast<float *>(malloc(N_A * sizeof(float)));

  // Bail out cleanly instead of dereferencing a null pointer below.
  if (a == nullptr || b == nullptr || res == nullptr) {
    fprintf(stderr, "Memory allocation failed.\n");
    free(a);
    free(b);
    free(res);
    return 1;
  }

  // Initialize a and b
  for (int i = 0; i < N_A; ++i)
    a[i] = i * 0.0001f;
  for (int i = 0; i < N_B; ++i)
    b[i] = (i % 1000) * 0.001f;

  float sum = 0;
  int count = 0;
  double start, end;

  // Dummy target region to warm up GPU
#pragma omp target
  {
    ;
  }

  start = omp_get_wtime();
  for (int j = 0; j < NUM_ITERATIONS; ++j) {
    // `nowait` lets the host continue to the loop over b without waiting for
    // the offloaded loop to finish.
#pragma omp target teams distribute parallel for nowait map(to : a[0 : N_A]) \
    map(from : res[0 : N_A])
    for (int i = 0; i < N_A; ++i) {
      float acc = a[i];
      for (int k = 0; k < 20; ++k)
        acc = sinf(acc) * expf(acc) + acc * 1.01f;
      res[i] = acc;
    }
    for (int i = 0; i < N_B; ++i) {
      float val = b[i];
      if (val > 0.5f)
        count++;
      sum += val * 0.1f;
    }
    // Wait for the deferred target region before the next round reuses a/res.
#pragma omp taskwait
  }
  end = omp_get_wtime();
  printf("Hybrid Non-blocking: %f seconds\n", end - start);
  free(a);
  free(b);
  free(res);
  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
!> Hybrid non-blocking benchmark: for NUM_ITERATIONS rounds, launch the loop
!> over `a` (writing `res`) on the device with `nowait`, run the loop over `b`
!> on the host in the meantime, then synchronize with `taskwait`.
!> Prints the total elapsed wall-clock time.
program hybrid_non_blocking
  use, intrinsic :: iso_fortran_env, only: sp => real32, dp => real64
  use omp_lib, only: omp_get_wtime, omp_set_default_device
  implicit none

  integer, parameter :: N_A = 1000000, N_B = 500000, NUM_ITERATIONS = 10000
  real(sp), allocatable :: a(:), b(:), res(:)
  real(sp) :: sum
  integer :: count, i, j
  real(dp) :: t_start, t_end

  allocate(a(N_A), b(N_B), res(N_A))
  sum = 0.0_sp
  count = 0

  do i = 1, N_A
    a(i) = i * 0.0001_sp
  end do

  do i = 1, N_B
    b(i) = mod(i, 1000) * 0.001_sp
  end do

  ! Select the device before the warm-up so the warm-up region runs on the
  ! same device as the timed target regions.
  call omp_set_default_device(0)

  ! Dummy target region to warm up GPU
  !$omp target
  !$omp end target

  ! Wall-clock timing: cpu_time reports processor time, which does not
  ! reflect elapsed time when host and device work overlap.
  t_start = omp_get_wtime()

  do j = 1, NUM_ITERATIONS
    ! nowait lets the host continue to the loop over b without waiting for
    ! the offloaded loop to finish.
    !$omp target teams distribute parallel do nowait map(to: a(1:N_A)) map(from: res(1:N_A))
    do i = 1, N_A
      res(i) = a(i)
      res(i) = res(i) + sin(res(i)) * exp(res(i))
      res(i) = res(i) * 1.01_sp
    end do

    do i = 1, N_B
      if (b(i) > 0.5_sp) then
        count = count + 1
      end if
      sum = sum + b(i) * 0.1_sp
    end do

    ! Wait for the deferred target region before the next round reuses a/res.
    !$omp taskwait
  end do

  t_end = omp_get_wtime()

  print *, "Hybrid Non-blocking: ", t_end - t_start, " seconds"
  deallocate(a, b, res)

end program hybrid_non_blocking
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"description": "oneAPI GPU Optimization Guide Examples",
"toolchain": [
"dpcpp",
"ifx"
"ifort"
],
"languages": [
{
Expand Down

This file was deleted.

Loading