Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce forEach multi-stage domain specific language #4

Merged
merged 13 commits into from
Nov 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
114 changes: 85 additions & 29 deletions laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,24 @@ template detachGC*(): untyped =
if(omp_get_thread_num()!=0):
teardownForeignThreadGc()

template omp_parallel*(body: untyped): untyped =
  ## Starts an OpenMP parallel section (`#pragma omp parallel`):
  ## `body` is executed by every thread of the team.
  ##
  ## Don't forget to use attachGC and detachGC if you are allocating
  ## sequences, strings, or reference types.
  ## Those should be thread-local temporaries.
  {.emit: "#pragma omp parallel".}
  block: body

template omp_parallel_if*(condition: bool, body: untyped) =
  ## Starts an OpenMP parallel section only when `condition` is true
  ## (`#pragma omp parallel if (...)`); otherwise `body` runs serially
  ## on the calling thread.
  ##
  ## `condition` is evaluated exactly once into a local so the emitted
  ## pragma references a valid C lvalue.
  let predicate = condition # Make symbol valid and ensure it's lvalue
  {.emit: "#pragma omp parallel if (`predicate`)".}
  block: body

template omp_for*(
index: untyped,
length: Natural,
use_simd: static bool,
use_simd, nowait: static bool,
body: untyped
) =
## OpenMP for loop (not parallel)
Expand All @@ -132,10 +146,10 @@ template omp_for*(
## x[i+1] += y[i+1]
## x[i+2] += y[i+2]
## ...
when use_simd:
const omp_annotation = "for simd"
else:
const omp_annotation = "for"
const omp_annotation = block:
"for " &
(when use_simd: "simd " else: "") &
(when nowait: "nowait " else: "")
for `index`{.inject.} in `||`(0, length-1, omp_annotation):
block: body

Expand Down Expand Up @@ -219,6 +233,52 @@ template omp_parallel_for_default*(
body
)

template omp_chunks*(
    omp_size: Natural, #{lvalue} # TODO parameter constraint, pending https://github.com/nim-lang/Nim/issues/9620
    chunk_offset, chunk_size: untyped,
    body: untyped): untyped =
  ## Internal template: the chunking part of `omp_parallel_chunks`.
  ## Must be called from inside an OpenMP parallel region; it divides
  ## `omp_size` work items across the team's threads and injects
  ## `chunk_offset` / `chunk_size` describing this thread's share
  ## before executing `body`.
  ##
  ## `omp_size` should be a lvalue (assigned value) and not
  ## the result of a routine otherwise the routine and its side-effect
  ## will be called multiple times.

  # The following simple chunking scheme can lead to severe load imbalance
  #
  # `chunk_offset`{.inject.} = chunk_size * thread_id
  # `chunk_size`{.inject.} = if thread_id < nb_chunks - 1: chunk_size
  #                          else: omp_size - chunk_offset
  #
  # For example dividing 40 items on 12 threads will lead to
  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
  let
    nb_chunks = omp_get_num_threads()
    base_chunk_size = omp_size div nb_chunks
    remainder = omp_size mod nb_chunks
    thread_id = omp_get_thread_num()

  # Instead of dividing 40 work items on 12 cores into:
  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
  # the following scheme will divide into
  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
  #
  # This is compliant with OpenMP spec (page 60)
  # http://www.openmp.org/mp-documents/openmp-4.5.pdf
  # "When no chunk_size is specified, the iteration space is divided into chunks
  # that are approximately equal in size, and at most one chunk is distributed to
  # each thread. The size of the chunks is unspecified in this case."
  # ---> chunks are the same ±1

  # The first `remainder` threads take one extra item each; the rest
  # take exactly `base_chunk_size` items, offset past the extras.
  var `chunk_offset`{.inject.}, `chunk_size`{.inject.}: Natural
  if thread_id < remainder:
    chunk_offset = (base_chunk_size + 1) * thread_id
    chunk_size = base_chunk_size + 1
  else:
    chunk_offset = base_chunk_size * thread_id + remainder
    chunk_size = base_chunk_size

  block: body

template omp_parallel_chunks*(
length: Natural,
chunk_offset, chunk_size: untyped,
Expand Down Expand Up @@ -247,21 +307,11 @@ template omp_parallel_chunks*(
let `chunk_size`{.inject.} = length
block: body
else:
let
omp_size = length # make sure if length is computed it's only done once
max_threads = omp_get_max_threads()
omp_condition = omp_grain_size * max_threads < omp_size

{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = omp_size div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
else: ompsize - chunk_offset
block: body
let omp_size = length # make sure if length is computed it's only done once
let over_threshold = omp_grain_size * omp_get_max_threads() < omp_size

omp_parallel_if(over_threshold):
omp_chunks(omp_size, chunk_offset, chunk_size, body)

template omp_parallel_chunks_default*(
length: Natural,
Expand All @@ -282,19 +332,25 @@ template omp_parallel_chunks_default*(
body
)

template omp_parallel*(body: untyped): untyped =
## Starts an openMP parallel section
##
## Don't forget to use attachGC and detachGC if you are allocating
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
{.emit: "#pragma omp parallel".}
block: body

template omp_critical*(body: untyped): untyped =
  ## Marks `body` as an OpenMP critical section
  ## (`#pragma omp critical`): at most one thread executes it at a time.
  {.emit: "#pragma omp critical".}
  block: body

template omp_master*(body: untyped): untyped =
  ## Restricts `body` to the master thread of the current team
  ## (`#pragma omp master`); other threads skip it, with no implied barrier.
  {.emit: "#pragma omp master".}
  block: body

template omp_barrier*(): untyped =
  ## Emits an explicit synchronization barrier (`#pragma omp barrier`):
  ## every thread of the team waits here until all have arrived.
  {.emit: "#pragma omp barrier".}

import macros
macro omp_flush*(variables: varargs[untyped]): untyped =
  ## Emits `#pragma omp flush (v1,v2,...)` for the given variables,
  ## forcing their thread-local views to be made consistent with memory.
  ##
  ## Each identifier is rendered as a backtick-quoted name so the
  ## `{.emit.}` pragma substitutes the mangled C symbol.
  var listvars = "("
  for i, v in variables:
    if i > 0:
      listvars.add ','
    listvars.add '`'
    listvars.add $v
    listvars.add '`'
  listvars.add ')'
  result = quote do:
    {.emit: "#pragma omp flush " & `listvars`.}
139 changes: 54 additions & 85 deletions laser/strided_iteration/foreach.nim
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export omp_suffix # Pending https://github.com/nim-lang/Nim/issues/9365 or 9366

proc forEachContiguousImpl(
values, raw_ptrs, size, loopBody: NimNode,
use_openmp: static bool, omp_params: NimNode,
use_openmp: static bool,
): NimNode =
# Build the parallel body of a contiguous iterator

Expand All @@ -52,19 +52,9 @@ proc forEachContiguousImpl(
)

if use_openmp:
if omp_params.isNil:
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
let
omp_grain_size = omp_params[0]
use_simd = omp_params[1]
result = quote do:
omp_parallel_for(
`index`, `size`,
`omp_grain_size`, `use_simd`):
`body`
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
result = quote do:
for `index` in 0 ..< `size`:
Expand All @@ -74,8 +64,7 @@ proc forEachStridedImpl(
values, aliases,
raw_ptrs, size,
loopBody: NimNode,
use_openmp: static bool,
omp_params: NimNode,
use_openmp: static bool
): NimNode =
# Build the parallel body of a strided iterator

Expand Down Expand Up @@ -110,14 +99,10 @@ proc forEachStridedImpl(
let body = loopBody.replaceNodes(replacements = elems_strided, to_replace = values)
let stridedBody = stridedBodyTemplate()

let omp_grain_size = newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
if use_openmp:
let
omp_grain_size = if omp_params.isNil: newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
) else: newLit(
omp_params[0].intVal div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
use_simd = if omp_params.isNil: newLit true else: omp_params[1]
result = quote do:
omp_parallel_chunks(
`size`, `chunk_offset`, `chunk_size`,
Expand All @@ -126,56 +111,30 @@ proc forEachStridedImpl(
else:
result = stridedBody

template forEachContiguousTemplate(use_openmp: static bool){.dirty.} =
template forEachSimpleTemplate(contiguous, use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let alias0 = aliases[0]

result = quote do:
block:
`aliases_stmt`
`test_shapes`
`raw_ptrs_stmt`
let `size` = `alias0`.size
`body`

template forEachStridedTemplate(use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode
params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let body = if contiguous:
forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp
)
else:
forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]

result = quote do:
Expand All @@ -190,24 +149,23 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let contiguous_body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
values, raw_ptrs, size, loopBody, use_openmp
)
let strided_body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]
var test_C_Contiguous = newCall(ident"is_C_contiguous", alias0)
Expand All @@ -232,52 +190,63 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =

macro forEachContiguous*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguous x in a, y in b, z in c, (1024, true):
## forEachContiguous x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
forEachContiguousTemplate(true)
##
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Use ``forEachStaged`` to fine-tune those defaults.
forEachSimpleTemplate(contiguous = true, use_openmp = true)

macro forEachContiguousSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguousSerial x in a, y in b, z in c:
## x += y * z
## OpenMP parameters will be ignored
forEachContiguousTemplate(false)
forEachSimpleTemplate(contiguous = true, use_openmp = false)

macro forEachStrided*(args: varargs[untyped]): untyped =
## Format:
## forEachStrided x in a, y in b, z in c, (1024, true):
## forEachStrided x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The OpenMP minimal per-core grain size
## is always scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
forEachStridedTemplate(true)
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachSimpleTemplate(contiguous = false, use_openmp = true)

macro forEachStridedSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachStridedSerial x in a, y in b, z in c:
## x += y * z
##
## Strided iteration with serial execution. OpenMP params passed to it will be ignored
forEachStridedTemplate(false)
forEachSimpleTemplate(contiguous = false, use_openmp = false)

macro forEach*(args: varargs[untyped]): untyped =
## Format:
## forEach x in a, y in b, z in c, (1024, true):
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The iteration strategy is selected at runtime depending of
## the tensors memory layout. If you know at compile-time that the tensors are
## contiguous or strided, use forEachContiguous or forEachStrided instead.
## Runtime selection requires duplicating the code body.
##
## If the tensors are non-contiguous, the OpenMP minimal per-core grain size
## is scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
## In the contiguous case:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Otherwise if tensor is strided:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachTemplate(true)

macro forEachSerial*(args: varargs[untyped]): untyped =
Expand Down