Skip to content

Commit

Permalink
Merge pull request #4 from numforge/foreach-dsl
Browse files Browse the repository at this point in the history
Introduce forEach multi-stage domain specific language
  • Loading branch information
mratsim committed Nov 5, 2018
2 parents 104e9ee + 2e40b75 commit cd96db9
Show file tree
Hide file tree
Showing 4 changed files with 495 additions and 142 deletions.
114 changes: 85 additions & 29 deletions laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,24 @@ template detachGC*(): untyped =
if(omp_get_thread_num()!=0):
teardownForeignThreadGc()

template omp_parallel*(body: untyped): untyped =
  ## Starts an OpenMP parallel section: `body` is executed once per thread.
  ##
  ## Don't forget to use attachGC and detachGC if you are allocating
  ## sequences, strings, or reference types.
  ## Those should be thread-local temporaries.
  # The emitted pragma must immediately precede the statement it applies to:
  # the generated C `#pragma omp parallel` attaches to the block below.
  {.emit: "#pragma omp parallel".}
  block: body

template omp_parallel_if*(condition: bool, body: untyped) =
  ## Starts an OpenMP parallel section guarded by a runtime `if` clause:
  ## when `condition` is false the region runs on a single thread.
  ##
  ## `condition` is evaluated exactly once, into `predicate`, so a
  ## side-effecting expression is not re-evaluated by the pragma.
  let predicate = condition # Make symbol valid and ensure it's lvalue
  {.emit: "#pragma omp parallel if (`predicate`)".}
  block: body

template omp_for*(
index: untyped,
length: Natural,
use_simd: static bool,
use_simd, nowait: static bool,
body: untyped
) =
## OpenMP for loop (not parallel)
Expand All @@ -132,10 +146,10 @@ template omp_for*(
## x[i+1] += y[i+1]
## x[i+2] += y[i+2]
## ...
when use_simd:
const omp_annotation = "for simd"
else:
const omp_annotation = "for"
const omp_annotation = block:
"for " &
(when use_simd: "simd " else: "") &
(when nowait: "nowait " else: "")
for `index`{.inject.} in `||`(0, length-1, omp_annotation):
block: body

Expand Down Expand Up @@ -219,6 +233,52 @@ template omp_parallel_for_default*(
body
)

template omp_chunks*(
    omp_size: Natural, #{lvalue} # TODO parameter constraint, pending https://github.com/nim-lang/Nim/issues/9620
    chunk_offset, chunk_size: untyped,
    body: untyped): untyped =
  ## Internal template.
  ## This is the chunk part of omp_parallel_chunk: it splits `omp_size`
  ## work items into balanced per-thread chunks, injects `chunk_offset` and
  ## `chunk_size` for the current thread, then runs `body`.
  ## omp_size should be a lvalue (assigned value) and not
  ## the result of a routine otherwise routine and its side-effect will be called multiple times

  # The following simple chunking scheme can lead to severe load imbalance
  #
  # `chunk_offset`{.inject.} = chunk_size * thread_id
  # `chunk_size`{.inject.} = if thread_id < nb_chunks - 1: chunk_size
  #                          else: omp_size - chunk_offset
  #
  # For example dividing 40 items on 12 threads will lead to
  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
  let
    nb_chunks = omp_get_num_threads()
    base_chunk_size = omp_size div nb_chunks
    remainder = omp_size mod nb_chunks
    thread_id = omp_get_thread_num()

  # Instead of dividing 40 work items on 12 cores into:
  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
  # the following scheme will divide into
  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
  #
  # This is compliant with OpenMP spec (page 60)
  # http://www.openmp.org/mp-documents/openmp-4.5.pdf
  # "When no chunk_size is specified, the iteration space is divided into chunks
  # that are approximately equal in size, and at most one chunk is distributed to
  # each thread. The size of the chunks is unspecified in this case."
  # ---> chunks are the same ±1

  # The first `remainder` threads each take one extra item; if omp_size is
  # smaller than the thread count, trailing threads get chunk_size == 0,
  # so `body` should tolerate an empty range.
  var `chunk_offset`{.inject.}, `chunk_size`{.inject.}: Natural
  if thread_id < remainder:
    chunk_offset = (base_chunk_size + 1) * thread_id
    chunk_size = base_chunk_size + 1
  else:
    chunk_offset = base_chunk_size * thread_id + remainder
    chunk_size = base_chunk_size

  block: body

template omp_parallel_chunks*(
length: Natural,
chunk_offset, chunk_size: untyped,
Expand Down Expand Up @@ -247,21 +307,11 @@ template omp_parallel_chunks*(
let `chunk_size`{.inject.} = length
block: body
else:
let
omp_size = length # make sure if length is computed it's only done once
max_threads = omp_get_max_threads()
omp_condition = omp_grain_size * max_threads < omp_size

{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = omp_size div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
else: ompsize - chunk_offset
block: body
let omp_size = length # make sure if length is computed it's only done once
let over_threshold = omp_grain_size * omp_get_max_threads() < omp_size

omp_parallel_if(over_threshold):
omp_chunks(omp_size, chunk_offset, chunk_size, body)

template omp_parallel_chunks_default*(
length: Natural,
Expand All @@ -282,19 +332,25 @@ template omp_parallel_chunks_default*(
body
)

template omp_parallel*(body: untyped): untyped =
  ## Starts an OpenMP parallel section: `body` is executed once per thread.
  ##
  ## Don't forget to use attachGC and detachGC if you are allocating
  ## sequences, strings, or reference types.
  ## Those should be thread-local temporaries.
  # The emitted pragma must immediately precede the statement it applies to:
  # the generated C `#pragma omp parallel` attaches to the block below.
  {.emit: "#pragma omp parallel".}
  block: body

template omp_critical*(body: untyped): untyped =
  ## Wraps `body` in an OpenMP critical section:
  ## only one thread at a time may execute it.
  {.emit: "#pragma omp critical".}
  block: body

template omp_master*(body: untyped): untyped =
  ## Restricts `body` to the master thread (thread 0) of the current team.
  ## Unlike a critical section, there is no implied synchronization.
  {.emit: "#pragma omp master".}
  block: body

template omp_barrier*(): untyped =
  ## Synchronization point: every thread of the current team must reach
  ## the barrier before any of them may proceed.
  {.emit: "#pragma omp barrier".}

import macros
macro omp_flush*(variables: varargs[untyped]): untyped =
  ## Emits `#pragma omp flush (var1,var2,...)`, making the calling thread's
  ## temporary view of the listed variables consistent with memory.
  ##
  ## Each identifier is wrapped in backticks so that `{.emit.}` substitutes
  ## the backend (C) name of the corresponding Nim symbol.
  ##
  ## With no arguments, emits the list-free `#pragma omp flush`, which
  ## flushes the whole thread-visible state.
  if variables.len == 0:
    # `#pragma omp flush ()` with an empty parenthesized list is invalid C;
    # the OpenMP flush construct without a list is the correct spelling.
    result = quote do:
      {.emit: "#pragma omp flush".}
  else:
    var listvars = "("
    for i, variable in variables:
      if i == 0:
        listvars.add "`" & $variable & "`"
      else:
        listvars.add ",`" & $variable & "`"
    listvars.add ')'
    result = quote do:
      {.emit: "#pragma omp flush " & `listvars`.}
139 changes: 54 additions & 85 deletions laser/strided_iteration/foreach.nim
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export omp_suffix # Pending https://github.com/nim-lang/Nim/issues/9365 or 9366

proc forEachContiguousImpl(
values, raw_ptrs, size, loopBody: NimNode,
use_openmp: static bool, omp_params: NimNode,
use_openmp: static bool,
): NimNode =
# Build the parallel body of a contiguous iterator

Expand All @@ -52,19 +52,9 @@ proc forEachContiguousImpl(
)

if use_openmp:
if omp_params.isNil:
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
let
omp_grain_size = omp_params[0]
use_simd = omp_params[1]
result = quote do:
omp_parallel_for(
`index`, `size`,
`omp_grain_size`, `use_simd`):
`body`
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
result = quote do:
for `index` in 0 ..< `size`:
Expand All @@ -74,8 +64,7 @@ proc forEachStridedImpl(
values, aliases,
raw_ptrs, size,
loopBody: NimNode,
use_openmp: static bool,
omp_params: NimNode,
use_openmp: static bool
): NimNode =
# Build the parallel body of a strided iterator

Expand Down Expand Up @@ -110,14 +99,10 @@ proc forEachStridedImpl(
let body = loopBody.replaceNodes(replacements = elems_strided, to_replace = values)
let stridedBody = stridedBodyTemplate()

let omp_grain_size = newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
if use_openmp:
let
omp_grain_size = if omp_params.isNil: newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
) else: newLit(
omp_params[0].intVal div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
use_simd = if omp_params.isNil: newLit true else: omp_params[1]
result = quote do:
omp_parallel_chunks(
`size`, `chunk_offset`, `chunk_size`,
Expand All @@ -126,56 +111,30 @@ proc forEachStridedImpl(
else:
result = stridedBody

template forEachContiguousTemplate(use_openmp: static bool){.dirty.} =
template forEachSimpleTemplate(contiguous, use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let alias0 = aliases[0]

result = quote do:
block:
`aliases_stmt`
`test_shapes`
`raw_ptrs_stmt`
let `size` = `alias0`.size
`body`

template forEachStridedTemplate(use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode
params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let body = if contiguous:
forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp
)
else:
forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]

result = quote do:
Expand All @@ -190,24 +149,23 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let contiguous_body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
values, raw_ptrs, size, loopBody, use_openmp
)
let strided_body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]
var test_C_Contiguous = newCall(ident"is_C_contiguous", alias0)
Expand All @@ -232,52 +190,63 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =

macro forEachContiguous*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguous x in a, y in b, z in c, (1024, true):
## forEachContiguous x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
forEachContiguousTemplate(true)
##
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Use ``forEachStaged`` to fine-tune those defaults.
forEachSimpleTemplate(contiguous = true, use_openmp = true)

macro forEachContiguousSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguousSerial x in a, y in b, z in c:
## x += y * z
## OpenMP parameters will be ignored
forEachContiguousTemplate(false)
forEachSimpleTemplate(contiguous = true, use_openmp = false)

macro forEachStrided*(args: varargs[untyped]): untyped =
## Format:
## forEachStrided x in a, y in b, z in c, (1024, true):
## forEachStrided x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The OpenMP minimal per-core grain size
## is always scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
forEachStridedTemplate(true)
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachSimpleTemplate(contiguous = false, use_openmp = true)

macro forEachStridedSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachStridedSerial x in a, y in b, z in c:
## x += y * z
##
## Strided iteration with serial execution. OpenMP params passed to it will be ignored
forEachStridedTemplate(false)
forEachSimpleTemplate(contiguous = false, use_openmp = false)

macro forEach*(args: varargs[untyped]): untyped =
## Format:
## forEach x in a, y in b, z in c, (1024, true):
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The iteration strategy is selected at runtime depending of
## the tensors memory layout. If you know at compile-time that the tensors are
## contiguous or strided, use forEachContiguous or forEachStrided instead.
## Runtime selection requires duplicating the code body.
##
## If the tensors are non-contiguous, the OpenMP minimal per-core grain size
## is scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
## In the contiguous case:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Otherwise if tensor is strided:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachTemplate(true)

macro forEachSerial*(args: varargs[untyped]): untyped =
Expand Down

0 comments on commit cd96db9

Please sign in to comment.