410 changes: 410 additions & 0 deletions openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,410 @@
// RUN: %libomp-compile-and-run
/*
Test for the 'schedule(simd:guided)' clause.
Compiler needs to generate a dynamic dispatching and pass the schedule
value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations.
*/
#include <stdio.h>
#include <omp.h>

#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1);
#else
#include <unistd.h>
#define delay() usleep(10);
#endif

// uncomment for debug diagnostics:
//#define DEBUG

#define SIMD_LEN 4

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL
enum sched {
kmp_sch_static_balanced_chunked = 45,
kmp_sch_guided_simd = 46,
kmp_sch_runtime_simd = 47,
};
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
typedef struct {
int reserved_1;
int flags;
int reserved_2;
int reserved_3;
char *psource;
} id;

extern int __kmpc_global_thread_num(id*);
extern void __kmpc_barrier(id*, int gtid);
extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};

// ---------------------------------------------------------------------------
int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
int err = 0;
static int volatile loop_sync = 0;
i64 lb; // Chunk lower bound
i64 ub; // Chunk upper bound
i64 st; // Chunk stride
int rc;
int tid = omp_get_thread_num();
int gtid = tid;
int last;
#if DEBUG
printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
(int)sizeof(i64), gtid, tid,
(int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
#endif
// Don't test degenerate cases that should have been discovered by codegen
if (loop_st == 0)
return 0;
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
return 0;

__kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
loop_lb, loop_ub, loop_st, loop_chunk);
if (tid == 0) {
// Let the master thread handle the chunks alone
int chunk; // No of current chunk
i64 next_lb; // Lower bound of the next chunk
i64 last_ub; // Upper bound of the last processed chunk
u64 cur; // Number of interations in current chunk
u64 max; // Max allowed iterations for current chunk
int undersized = 0;

chunk = 0;
next_lb = loop_lb;
max = (loop_ub - loop_lb) / loop_st + 1;
// The first chunk can consume all iterations
while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
++ chunk;
#if DEBUG
printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
#endif
// Check if previous chunk (it is not the final chunk) is undersized
if (undersized) {
printf("Error with chunk %d\n", chunk);
err++;
}
// Check lower and upper bounds
if (lb != next_lb) {
printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
err++;
}
if (loop_st > 0) {
if (!(ub <= loop_ub)) {
printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
err++;
}
if (!(lb <= ub)) {
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
err++;
}
} else {
if (!(ub >= loop_ub)) {
printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
err++;
}
if (!(lb >= ub)) {
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
err++;
}
}; // if
// Stride should not change
if (!(st == loop_st)) {
printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
err++;
}
cur = (ub - lb) / loop_st + 1;
// Guided scheduling uses FP computations, so current chunk may
// be a bit bigger (+1) than allowed maximum
if (!(cur <= max + 1)) {
printf("Error with iter %d, %d\n", cur, max);
err++;
}
// Update maximum for the next chunk
if (cur < max)
max = cur;
next_lb = ub + loop_st;
last_ub = ub;
undersized = (cur < loop_chunk);
}; // while
// Must have at least one chunk
if (!(chunk > 0)) {
printf("Error with chunk %d\n", chunk);
err++;
}
// Must have the right last iteration index
if (loop_st > 0) {
if (!(last_ub <= loop_ub)) {
printf("Error with last1 %d, %d, ch %d\n",
(int)last_ub, (int)loop_ub, chunk);
err++;
}
if (!(last_ub + loop_st > loop_ub)) {
printf("Error with last2 %d, %d, %d, ch %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
err++;
}
} else {
if (!(last_ub >= loop_ub)) {
printf("Error with last1 %d, %d, ch %d\n",
(int)last_ub, (int)loop_ub, chunk);
err++;
}
if (!(last_ub + loop_st < loop_ub)) {
printf("Error with last2 %d, %d, %d, ch %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
err++;
}
}; // if
// Let non-master threads go
loop_sync = 1;
} else {
int i;
// Workers wait for master thread to finish, then call __kmpc_dispatch_next
for (i = 0; i < 1000000; ++ i) {
if (loop_sync != 0) {
break;
}; // if
}; // for i
while (loop_sync == 0) {
delay();
}; // while
// At this moment we do not have any more chunks -- all the chunks already
// processed by master thread
rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st);
if (rc) {
printf("Error return value\n");
err++;
}
}; // if

__kmpc_barrier(&loc, gtid);
if (tid == 0) {
loop_sync = 0; // Restore original state
#if DEBUG
printf("run_loop_64(): at the end\n");
#endif
}; // if
__kmpc_barrier(&loc, gtid);
return err;
} // run_loop

// ---------------------------------------------------------------------------
int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
int err = 0;
static int volatile loop_sync = 0;
int lb; // Chunk lower bound
int ub; // Chunk upper bound
int st; // Chunk stride
int rc;
int tid = omp_get_thread_num();
int gtid = tid;
int last;
#if DEBUG
printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
(int)sizeof(int), gtid, tid,
(int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
#endif
// Don't test degenerate cases that should have been discovered by codegen
if (loop_st == 0)
return 0;
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
return 0;

__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
loop_lb, loop_ub, loop_st, loop_chunk);
if (tid == 0) {
// Let the master thread handle the chunks alone
int chunk; // No of current chunk
int next_lb; // Lower bound of the next chunk
int last_ub; // Upper bound of the last processed chunk
u64 cur; // Number of interations in current chunk
u64 max; // Max allowed iterations for current chunk
int undersized = 0;

chunk = 0;
next_lb = loop_lb;
max = (loop_ub - loop_lb) / loop_st + 1;
// The first chunk can consume all iterations
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
++ chunk;
#if DEBUG
printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
#endif
// Check if previous chunk (it is not the final chunk) is undersized
if (undersized) {
printf("Error with chunk %d\n", chunk);
err++;
}
// Check lower and upper bounds
if (lb != next_lb) {
printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
err++;
}
if (loop_st > 0) {
if (!(ub <= loop_ub)) {
printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
err++;
}
if (!(lb <= ub)) {
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
err++;
}
} else {
if (!(ub >= loop_ub)) {
printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
err++;
}
if (!(lb >= ub)) {
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
err++;
}
}; // if
// Stride should not change
if (!(st == loop_st)) {
printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
err++;
}
cur = (ub - lb) / loop_st + 1;
// Guided scheduling uses FP computations, so current chunk may
// be a bit bigger (+1) than allowed maximum
if (!(cur <= max + 1)) {
printf("Error with iter %d, %d\n", cur, max);
err++;
}
// Update maximum for the next chunk
if (cur < max)
max = cur;
next_lb = ub + loop_st;
last_ub = ub;
undersized = (cur < loop_chunk);
}; // while
// Must have at least one chunk
if (!(chunk > 0)) {
printf("Error with chunk %d\n", chunk);
err++;
}
// Must have the right last iteration index
if (loop_st > 0) {
if (!(last_ub <= loop_ub)) {
printf("Error with last1 %d, %d, ch %d\n",
(int)last_ub, (int)loop_ub, chunk);
err++;
}
if (!(last_ub + loop_st > loop_ub)) {
printf("Error with last2 %d, %d, %d, ch %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
err++;
}
} else {
if (!(last_ub >= loop_ub)) {
printf("Error with last1 %d, %d, ch %d\n",
(int)last_ub, (int)loop_ub, chunk);
err++;
}
if (!(last_ub + loop_st < loop_ub)) {
printf("Error with last2 %d, %d, %d, ch %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
err++;
}
}; // if
// Let non-master threads go
loop_sync = 1;
} else {
int i;
// Workers wait for master thread to finish, then call __kmpc_dispatch_next
for (i = 0; i < 1000000; ++ i) {
if (loop_sync != 0) {
break;
}; // if
}; // for i
while (loop_sync == 0) {
delay();
}; // while
// At this moment we do not have any more chunks -- all the chunks already
// processed by the master thread
rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
if (rc) {
printf("Error return value\n");
err++;
}
}; // if

__kmpc_barrier(&loc, gtid);
if (tid == 0) {
loop_sync = 0; // Restore original state
#if DEBUG
printf("run_loop<>(): at the end\n");
#endif
}; // if
__kmpc_barrier(&loc, gtid);
return err;
} // run_loop

// ---------------------------------------------------------------------------
int run_64(int num_th)
{
int err = 0;
#pragma omp parallel num_threads(num_th)
{
int chunk;
i64 st, lb, ub;
for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
for (st = 1; st <= 3; ++ st) {
for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
err += run_loop_64(lb, ub, st, chunk);
err += run_loop_64(ub, lb, -st, chunk);
}; // for ub
}; // for lb
}; // for st
}; // for chunk
}
return err;
} // run_all

int run_32(int num_th)
{
int err = 0;
#pragma omp parallel num_threads(num_th)
{
int chunk, st, lb, ub;
for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
for (st = 1; st <= 3; ++ st) {
for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
err += run_loop_32(lb, ub, st, chunk);
err += run_loop_32(ub, lb, -st, chunk);
}; // for ub
}; // for lb
}; // for st
}; // for chunk
}
return err;
} // run_all

// ---------------------------------------------------------------------------
int main()
{
int n, err = 0;
for (n = 1; n <= 4; ++ n) {
err += run_32(n);
err += run_64(n);
}; // for n
if (err)
printf("failed with %d errors\n", err);
else
printf("passed\n");
return err;
}
221 changes: 221 additions & 0 deletions openmp/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// RUN: %libomp-compile-and-run

// The test checks schedule(simd:runtime)
// in combination with omp_set_schedule()
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1);
#define seten(a,b,c) _putenv_s((a),(b))
#else
#include <unistd.h>
#define delay() usleep(10);
#define seten(a,b,c) setenv((a),(b),(c))
#endif

#define SIMD_LEN 4
int err = 0;

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL.
enum sched {
kmp_sch_static_balanced_chunked = 45,
kmp_sch_guided_simd = 46,
kmp_sch_runtime_simd = 47,
};
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
typedef struct {
int reserved_1;
int flags;
int reserved_2;
int reserved_3;
char *psource;
} id;

#ifdef __cplusplus
extern "C" {
#endif
int __kmpc_global_thread_num(id*);
void __kmpc_barrier(id*, int gtid);
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
#ifdef __cplusplus
} // extern "C"
#endif
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};

// ---------------------------------------------------------------------------
void
run_loop(
int loop_lb, // Loop lower bound.
int loop_ub, // Loop upper bound.
int loop_st, // Loop stride.
int lchunk
) {
static int volatile loop_sync = 0;
int lb; // Chunk lower bound.
int ub; // Chunk upper bound.
int st; // Chunk stride.
int rc;
int tid = omp_get_thread_num();
int gtid = __kmpc_global_thread_num(&loc);
int last;
int tc = (loop_ub - loop_lb) / loop_st + 1;
int ch;
int no_chunk = 0;
if (lchunk == 0) {
no_chunk = 1;
lchunk = 1;
}
ch = lchunk * SIMD_LEN;
#if _DEBUG > 1
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
#endif
// Don't test degenerate cases that should have been discovered by codegen.
if (loop_st == 0)
return;
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
return;
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
loop_lb, loop_ub, loop_st, SIMD_LEN);
{
// Let the master thread handle the chunks alone.
int chunk; // No of current chunk.
int last_ub; // Upper bound of the last processed chunk.
u64 cur; // Number of interations in current chunk.
u64 max; // Max allowed iterations for current chunk.
int undersized = 0;
last_ub = loop_ub;
chunk = 0;
max = (loop_ub - loop_lb) / loop_st + 1;
// The first chunk can consume all iterations.
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
++ chunk;
#if _DEBUG
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
#endif
// Check if previous chunk (it is not the final chunk) is undersized.
if (undersized)
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
if (loop_st > 0) {
if (!(ub <= loop_ub))
printf("Error with ub %d, %d, ch %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb <= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
} else {
if (!(ub >= loop_ub))
printf("Error with ub %d, %d, %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb >= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
}; // if
// Stride should not change.
if (!(st == loop_st))
printf("Error with st %d, %d, ch %d, err %d\n",
(int)st, (int)loop_st, chunk, ++err);
cur = ( ub - lb ) / loop_st + 1;
// Guided scheduling uses FP computations, so current chunk may
// be a bit bigger (+1) than allowed maximum.
if (!( cur <= max + 1))
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
// Update maximum for the next chunk.
if (last) {
if (!no_chunk && cur > ch)
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
(int)cur, ch, tid, ++err);
} else {
if (cur % ch)
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
chunk, (int)cur, ch, tid, ++err);
}
if (cur < max)
max = cur;
last_ub = ub;
undersized = (cur < ch);
#if _DEBUG > 1
if (last)
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
undersized,cur,ch,tid,ub,lb,loop_st);
#endif
} // while
// Must have the right last iteration index.
if (loop_st > 0) {
if (!(last_ub <= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st > loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} else {
if (!(last_ub >= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st < loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} // if
}
__kmpc_barrier(&loc, gtid);
} // run_loop

int main(int argc, char *argv[])
{
int chunk = 0;
// static (no chunk)
omp_set_schedule(omp_sched_static,0);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// auto (chunk should be ignorted)
omp_set_schedule(omp_sched_auto,0);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// static,1
chunk = 1;
omp_set_schedule(omp_sched_static,1);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// dynamic,1
omp_set_schedule(omp_sched_dynamic,1);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// guided,1
omp_set_schedule(omp_sched_guided,1);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// dynamic,0 - use default chunk size 1
omp_set_schedule(omp_sched_dynamic,0);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

// guided,0 - use default chunk size 1
omp_set_schedule(omp_sched_guided,0);
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);

if (err) {
printf("failed, err = %d\n", err);
return 1;
} else {
printf("passed\n");
return 0;
}
}
196 changes: 196 additions & 0 deletions openmp/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// RUN: %libomp-compile
// RUN: env OMP_SCHEDULE=guided %libomp-run
// RUN: env OMP_SCHEDULE=guided,1 %libomp-run 1
// RUN: env OMP_SCHEDULE=guided,2 %libomp-run 2
// RUN: env OMP_SCHEDULE=dynamic %libomp-run
// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1
// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2
// RUN: env OMP_SCHEDULE=auto %libomp-run

// The test checks schedule(simd:runtime)
// in combination with OMP_SCHEDULE=guided[,chunk]
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1);
#define seten(a,b,c) _putenv_s((a),(b))
#else
#include <unistd.h>
#define delay() usleep(10);
#define seten(a,b,c) setenv((a),(b),(c))
#endif

#define UBOUND 100
#define SIMD_LEN 4
int err = 0;

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL.
enum sched {
kmp_sch_static_balanced_chunked = 45,
kmp_sch_guided_simd = 46,
kmp_sch_runtime_simd = 47,
};
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
typedef struct {
int reserved_1;
int flags;
int reserved_2;
int reserved_3;
char *psource;
} id;

#ifdef __cplusplus
extern "C" {
#endif
int __kmpc_global_thread_num(id*);
void __kmpc_barrier(id*, int gtid);
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
#ifdef __cplusplus
} // extern "C"
#endif
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};

// ---------------------------------------------------------------------------
void
run_loop(
int loop_lb, // Loop lower bound.
int loop_ub, // Loop upper bound.
int loop_st, // Loop stride.
int lchunk
) {
static int volatile loop_sync = 0;
int lb; // Chunk lower bound.
int ub; // Chunk upper bound.
int st; // Chunk stride.
int rc;
int tid = omp_get_thread_num();
int gtid = __kmpc_global_thread_num(&loc);
int last;
int tc = (loop_ub - loop_lb) / loop_st + 1;
int ch;
int no_chunk = 0;
if (lchunk == 0) {
no_chunk = 1;
lchunk = 1;
}
ch = lchunk * SIMD_LEN;
#if _DEBUG > 1
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
#endif
// Don't test degenerate cases that should have been discovered by codegen.
if (loop_st == 0)
return;
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
return;
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
loop_lb, loop_ub, loop_st, SIMD_LEN);
{
// Let the master thread handle the chunks alone.
int chunk; // No of current chunk.
int last_ub; // Upper bound of the last processed chunk.
u64 cur; // Number of interations in current chunk.
u64 max; // Max allowed iterations for current chunk.
int undersized = 0;
last_ub = loop_ub;
chunk = 0;
max = (loop_ub - loop_lb) / loop_st + 1;
// The first chunk can consume all iterations.
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
++ chunk;
#if _DEBUG
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
#endif
// Check if previous chunk (it is not the final chunk) is undersized.
if (undersized)
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
if (loop_st > 0) {
if (!(ub <= loop_ub))
printf("Error with ub %d, %d, ch %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb <= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
} else {
if (!(ub >= loop_ub))
printf("Error with ub %d, %d, %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb >= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
}; // if
// Stride should not change.
if (!(st == loop_st))
printf("Error with st %d, %d, ch %d, err %d\n",
(int)st, (int)loop_st, chunk, ++err);
cur = ( ub - lb ) / loop_st + 1;
// Guided scheduling uses FP computations, so current chunk may
// be a bit bigger (+1) than allowed maximum.
if (!( cur <= max + 1))
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
// Update maximum for the next chunk.
if (!last && cur % ch)
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
chunk, (int)cur, ch, tid, ++err);
if (last && !no_chunk && cur > ch)
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
(int)cur, ch, tid, ++err);
if (cur < max)
max = cur;
last_ub = ub;
undersized = (cur < ch);
#if _DEBUG > 1
if (last)
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
undersized,cur,ch,tid,ub,lb,loop_st);
#endif
} // while
// Must have the right last iteration index.
if (loop_st > 0) {
if (!(last_ub <= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st > loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} else {
if (!(last_ub >= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st < loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} // if
}
__kmpc_barrier(&loc, gtid);
} // run_loop

int main(int argc, char *argv[])
{
int chunk = 0;
if (argc > 1) {
// expect chunk size as a parameter
chunk = atoi(argv[1]);
}
#pragma omp parallel //num_threads(num_th)
run_loop(0, UBOUND, 1, chunk);
if (err) {
printf("failed, err = %d\n", err);
return 1;
} else {
printf("passed\n");
return 0;
}
}
201 changes: 201 additions & 0 deletions openmp/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// RUN: %libomp-compile && %libomp-run
// RUN: %libomp-run 1 && %libomp-run 2

// The test checks schedule(simd:runtime)
// in combination with OMP_SCHEDULE=static[,chunk]
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1);
#define seten(a,b,c) _putenv_s((a),(b))
#else
#include <unistd.h>
#define delay() usleep(10);
#define seten(a,b,c) setenv((a),(b),(c))
#endif

#define SIMD_LEN 4
int err = 0;

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL.
enum sched {
kmp_sch_static_balanced_chunked = 45,
kmp_sch_guided_simd = 46,
kmp_sch_runtime_simd = 47,
};
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
typedef struct {
int reserved_1;
int flags;
int reserved_2;
int reserved_3;
char *psource;
} id;

#ifdef __cplusplus
extern "C" {
#endif
int __kmpc_global_thread_num(id*);
void __kmpc_barrier(id*, int gtid);
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
#ifdef __cplusplus
} // extern "C"
#endif
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};

// ---------------------------------------------------------------------------
void
run_loop(
int loop_lb, // Loop lower bound.
int loop_ub, // Loop upper bound.
int loop_st, // Loop stride.
int lchunk
) {
static int volatile loop_sync = 0;
int lb; // Chunk lower bound.
int ub; // Chunk upper bound.
int st; // Chunk stride.
int rc;
int tid = omp_get_thread_num();
int gtid = __kmpc_global_thread_num(&loc);
int last;
int tc = (loop_ub - loop_lb) / loop_st + 1;
int ch;
int no_chunk = 0;
if (lchunk == 0) {
no_chunk = 1;
lchunk = 1;
}
ch = lchunk * SIMD_LEN;
#if _DEBUG > 1
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
#endif
// Don't test degenerate cases that should have been discovered by codegen.
if (loop_st == 0)
return;
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
return;
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
loop_lb, loop_ub, loop_st, SIMD_LEN);
{
// Let the master thread handle the chunks alone.
int chunk; // No of current chunk.
int last_ub; // Upper bound of the last processed chunk.
u64 cur; // Number of interations in current chunk.
u64 max; // Max allowed iterations for current chunk.
int undersized = 0;
last_ub = loop_ub;
chunk = 0;
max = (loop_ub - loop_lb) / loop_st + 1;
// The first chunk can consume all iterations.
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
++ chunk;
#if _DEBUG
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
#endif
// Check if previous chunk (it is not the final chunk) is undersized.
if (undersized)
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
if (loop_st > 0) {
if (!(ub <= loop_ub))
printf("Error with ub %d, %d, ch %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb <= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
} else {
if (!(ub >= loop_ub))
printf("Error with ub %d, %d, %d, err %d\n",
(int)ub, (int)loop_ub, chunk, ++err);
if (!(lb >= ub))
printf("Error with bounds %d, %d, %d, err %d\n",
(int)lb, (int)ub, chunk, ++err);
}; // if
// Stride should not change.
if (!(st == loop_st))
printf("Error with st %d, %d, ch %d, err %d\n",
(int)st, (int)loop_st, chunk, ++err);
cur = ( ub - lb ) / loop_st + 1;
// Guided scheduling uses FP computations, so current chunk may
// be a bit bigger (+1) than allowed maximum.
if (!( cur <= max + 1))
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
// Update maximum for the next chunk.
if (last) {
if (!no_chunk && cur > ch)
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
(int)cur, ch, tid, ++err);
} else {
if (cur % ch)
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
chunk, (int)cur, ch, tid, ++err);
}
if (cur < max)
max = cur;
last_ub = ub;
undersized = (cur < ch);
#if _DEBUG > 1
if (last)
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
undersized,cur,ch,tid,ub,lb,loop_st);
#endif
} // while
// Must have the right last iteration index.
if (loop_st > 0) {
if (!(last_ub <= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st > loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} else {
if (!(last_ub >= loop_ub))
printf("Error with last1 %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_ub, chunk, ++err);
if (last && !(last_ub + loop_st < loop_ub))
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
} // if
}
__kmpc_barrier(&loc, gtid);
} // run_loop

int main(int argc, char *argv[])
{
int chunk = 0;
if (argc > 1) {
char *buf = malloc(8 + strlen(argv[1]));
// expect chunk size as a parameter
chunk = atoi(argv[1]);
strcpy(buf,"static,");
strcat(buf,argv[1]);
seten("OMP_SCHEDULE",buf,1);
printf("Testing schedule(simd:%s)\n", buf);
free(buf);
} else {
seten("OMP_SCHEDULE","static",1);
printf("Testing schedule(simd:static)\n");
}
#pragma omp parallel// num_threads(num_th)
run_loop(0, 26, 1, chunk);
if (err) {
printf("failed, err = %d\n", err);
return 1;
} else {
printf("passed\n");
return 0;
}
}