Skip to content

Commit

Permalink
OMP chunk template returns the number of chunks used
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Oct 23, 2018
1 parent 41b9aaf commit 9ba351a
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 9 deletions.
9 changes: 5 additions & 4 deletions laser/openmp/omp_parallel.nim
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ template omp_parallel_for_default*(
body)

template omp_parallel_chunks*(
length: Natural,
length: Natural, nb_chunks: var Natural,
chunk_id, chunk_offset, chunk_size: untyped,
omp_threshold: static Natural,
omp_grain_size: static Positive,
Expand All @@ -132,12 +132,13 @@ template omp_parallel_chunks*(
## or when operating on (contiguous) ranges for example for memset or memcpy

when not defined(openmp):
nb_chunks = 1
const `chunk_offset`{.inject.} = 0
let `chunk_size`{.inject.} = length
block: body
else:
let ompsize = length # If length is the result of a proc, call the proc only once
let nb_chunks = if omp_threshold < ompsize:
nb_chunks = if omp_threshold < ompsize:
min(
omp_get_max_threads(),
max(1, ompsize div omp_grain_size) # if ompsize < omp_grain_size
Expand All @@ -159,7 +160,7 @@ template omp_parallel_chunks*(
block: body

template omp_parallel_chunks_default*(
length: Natural,
length: Natural, nb_chunks: var Natural,
chunk_id, chunk_offset, chunk_size: untyped,
body: untyped): untyped =
## This will be renamed omp_parallel_chunks once
Expand All @@ -176,7 +177,7 @@ template omp_parallel_chunks_default*(
## A value of 1 will always parallelize the loop.
## - simd is used by default
omp_parallel_chunks(
length,
length, nb_chunks,
chunk_id, chunk_offset, chunk_size,
omp_threshold = OMP_MEMORY_BOUND_THRESHOLD,
omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
Expand Down
3 changes: 2 additions & 1 deletion laser/strided_iteration/foreach.nim
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,9 @@ proc forEachStridedImpl(
)
use_simd = if omp_params.isNil: newLit true else: omp_params[2]
result = quote do:
var nb_chunks: Natural
omp_parallel_chunks(
`size`, `chunk_id`, `chunk_offset`, `chunk_size`,
`size`, nb_chunks, `chunk_id`, `chunk_offset`, `chunk_size`,
`omp_threshold`, `omp_grain_size`, `use_simd`):
`stridedBody`
else:
Expand Down
12 changes: 8 additions & 4 deletions laser/tensor/initialization.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ proc deepCopy*[T](dst: var Tensor[T], src: Tensor[T]) =
# We use memcpy, due to SIMD optimizations in memcpy,
# we require higher parallelization thresholds
if src.is_C_contiguous:
var nb_chunks: Natural
omp_parallel_chunks(
size, chunk_offset, chunk_size,
size, nb_chunks, chunk_id, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -89,8 +90,9 @@ proc copyFrom*[T](dst: var Tensor[T], src: Tensor[T]) =
# we require higher parallelization thresholds
if src.is_C_contiguous:
assert dst.shape == src.shape
var nb_chunks: Natural
omp_parallel_chunks(
src.size, chunk_offset, chunk_size,
src.size, nb_chunks, chunk_id, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand All @@ -112,8 +114,9 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr UncheckedArray[T], len: Nat
## Destination tensor size and buffer length should be the same
when T.supportsCopyMem:
doAssert dst.size == len, "Tensor size and buffer length should be the same"
var nb_chunks: Natural
omp_parallel_chunks(
len, chunk_offset, chunk_size,
len, nb_chunks, chunk_id, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -142,8 +145,9 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
when not T.supportsCopyMem:
t.storage.raw_data.reset()
else:
var nb_chunks: Natural
omp_parallel_chunks(
t.size, chunk_offset, chunk_size,
t.size, nb_chunks, chunk_id, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
zeroMem(
Expand Down

0 comments on commit 9ba351a

Please sign in to comment.