OMP chunk template returns the number of chunks used

mratsim · Oct 23, 2018 · 9ba351a · 9ba351a
1 parent 41b9aaf
commit 9ba351a
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 9 deletions.
diff --git a/laser/openmp/omp_parallel.nim b/laser/openmp/omp_parallel.nim
@@ -111,7 +111,7 @@ template omp_parallel_for_default*(
     body)
 
 template omp_parallel_chunks*(
-    length: Natural,
+    length: Natural, nb_chunks: var Natural,
     chunk_id, chunk_offset, chunk_size: untyped,
     omp_threshold: static Natural,
     omp_grain_size: static Positive,
@@ -132,12 +132,13 @@ template omp_parallel_chunks*(
   ## or when operating on (contiguous) ranges for example for memset or memcpy
 
   when not defined(openmp):
+    nb_chunks = 1
     const `chunk_offset`{.inject.} = 0
     let `chunk_size`{.inject.} = length
     block: body
   else:
     let ompsize = length # If length is the result of a proc, call the proc only once
-    let nb_chunks = if omp_threshold < ompsize:
+    nb_chunks = if omp_threshold < ompsize:
       min(
         omp_get_max_threads(),
         max(1, ompsize div omp_grain_size) # if ompsize < omp_grain_size
@@ -159,7 +160,7 @@ template omp_parallel_chunks*(
         block: body
 
 template omp_parallel_chunks_default*(
-    length: Natural,
+    length: Natural, nb_chunks: var Natural,
     chunk_id, chunk_offset, chunk_size: untyped,
     body: untyped): untyped =
   ## This will be renamed omp_parallel_chunks once
@@ -176,7 +177,7 @@ template omp_parallel_chunks_default*(
   ##     A value of 1 will always parallelize the loop.
   ## - simd is used by default
   omp_parallel_chunks(
-    length,
+    length, nb_chunks,
     chunk_id, chunk_offset, chunk_size,
     omp_threshold = OMP_MEMORY_BOUND_THRESHOLD,
     omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,

diff --git a/laser/strided_iteration/foreach.nim b/laser/strided_iteration/foreach.nim
@@ -155,8 +155,9 @@ proc forEachStridedImpl(
                         )
       use_simd       = if omp_params.isNil: newLit true else: omp_params[2]
     result = quote do:
+      var nb_chunks: Natural
       omp_parallel_chunks(
-        `size`, `chunk_id`, `chunk_offset`, `chunk_size`,
+        `size`, nb_chunks, `chunk_id`, `chunk_offset`, `chunk_size`,
         `omp_threshold`, `omp_grain_size`, `use_simd`):
           `stridedBody`
   else:

diff --git a/laser/tensor/initialization.nim b/laser/tensor/initialization.nim
@@ -52,8 +52,9 @@ proc deepCopy*[T](dst: var Tensor[T], src: Tensor[T]) =
     # We use memcpy, due to SIMD optimizations in memcpy,
     # we require higher parallelization thresholds
     if src.is_C_contiguous:
+      var nb_chunks: Natural
       omp_parallel_chunks(
-            size, chunk_offset, chunk_size,
+            size, nb_chunks, chunk_id, chunk_offset, chunk_size,
             OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
             use_simd = false):
         copyMem(
@@ -89,8 +90,9 @@ proc copyFrom*[T](dst: var Tensor[T], src: Tensor[T]) =
     # we require higher parallelization thresholds
     if src.is_C_contiguous:
       assert dst.shape == src.shape
+      var nb_chunks: Natural
       omp_parallel_chunks(
-            src.size, chunk_offset, chunk_size,
+            src.size, nb_chunks, chunk_id, chunk_offset, chunk_size,
             OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
             use_simd = false):
         copyMem(
@@ -112,8 +114,9 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr UncheckedArray[T], len: Nat
   ## Destination tensor size and buffer length should be the same
   when T.supportsCopyMem:
     doAssert dst.size == len, "Tensor size and buffer length should be the same"
+    var nb_chunks: Natural
     omp_parallel_chunks(
-            len, chunk_offset, chunk_size,
+            len, nb_chunks, chunk_id, chunk_offset, chunk_size,
             OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
             use_simd = false):
         copyMem(
@@ -142,8 +145,9 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
   when not T.supportsCopyMem:
     t.storage.raw_data.reset()
   else:
+    var nb_chunks: Natural
     omp_parallel_chunks(
-          t.size, chunk_offset, chunk_size,
+          t.size, nb_chunks, chunk_id, chunk_offset, chunk_size,
           OMP_MEMORY_BOUND_THRESHOLD * 4, OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
           use_simd = false):
       zeroMem(