Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce forEach multi-stage domain specific language #4

Merged
merged 13 commits into from
Nov 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
114 changes: 85 additions & 29 deletions laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,24 @@ template detachGC*(): untyped =
if(omp_get_thread_num()!=0):
teardownForeignThreadGc()

template omp_parallel*(body: untyped): untyped =
  ## Starts an OpenMP parallel section (`#pragma omp parallel`):
  ## `body` is executed by every thread of the team.
  ##
  ## Don't forget to use attachGC and detachGC if you are allocating
  ## sequences, strings, or reference types.
  ## Those should be thread-local temporaries.
  {.emit: "#pragma omp parallel".}
  block: body

template omp_parallel_if*(condition: bool, body: untyped) =
  ## Starts an OpenMP parallel section only when `condition` is true
  ## (`#pragma omp parallel if (...)`); otherwise `body` runs serially
  ## on the calling thread.
  ##
  ## `condition` is evaluated exactly once into a local so the emitted
  ## pragma references a valid C lvalue.
  let predicate = condition # Make symbol valid and ensure it's lvalue
  {.emit: "#pragma omp parallel if (`predicate`)".}
  block: body

template omp_for*(
index: untyped,
length: Natural,
use_simd: static bool,
use_simd, nowait: static bool,
body: untyped
) =
## OpenMP for loop (not parallel)
Expand All @@ -132,10 +146,10 @@ template omp_for*(
## x[i+1] += y[i+1]
## x[i+2] += y[i+2]
## ...
when use_simd:
const omp_annotation = "for simd"
else:
const omp_annotation = "for"
const omp_annotation = block:
"for " &
(when use_simd: "simd " else: "") &
(when nowait: "nowait " else: "")
for `index`{.inject.} in `||`(0, length-1, omp_annotation):
block: body

Expand Down Expand Up @@ -219,6 +233,52 @@ template omp_parallel_for_default*(
body
)

template omp_chunks*(
    omp_size: Natural, #{lvalue} # TODO parameter constraint, pending https://github.com/nim-lang/Nim/issues/9620
    chunk_offset, chunk_size: untyped,
    body: untyped): untyped =
  ## Internal template: the chunking part of `omp_parallel_chunks`.
  ## Must be called from inside an OpenMP parallel region; it divides
  ## `omp_size` work items across the team's threads and injects
  ## `chunk_offset` / `chunk_size` describing this thread's share
  ## before executing `body`.
  ##
  ## `omp_size` should be a lvalue (assigned value) and not
  ## the result of a routine otherwise the routine and its side-effect
  ## will be called multiple times.

  # The following simple chunking scheme can lead to severe load imbalance
  #
  # `chunk_offset`{.inject.} = chunk_size * thread_id
  # `chunk_size`{.inject.} = if thread_id < nb_chunks - 1: chunk_size
  #                          else: omp_size - chunk_offset
  #
  # For example dividing 40 items on 12 threads will lead to
  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
  let
    nb_chunks = omp_get_num_threads()
    base_chunk_size = omp_size div nb_chunks
    remainder = omp_size mod nb_chunks
    thread_id = omp_get_thread_num()

  # Instead of dividing 40 work items on 12 cores into:
  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
  # the following scheme will divide into
  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
  #
  # This is compliant with OpenMP spec (page 60)
  # http://www.openmp.org/mp-documents/openmp-4.5.pdf
  # "When no chunk_size is specified, the iteration space is divided into chunks
  # that are approximately equal in size, and at most one chunk is distributed to
  # each thread. The size of the chunks is unspecified in this case."
  # ---> chunks are the same ±1

  # The first `remainder` threads take one extra item each; the rest
  # take exactly `base_chunk_size` items, offset past the extras.
  var `chunk_offset`{.inject.}, `chunk_size`{.inject.}: Natural
  if thread_id < remainder:
    chunk_offset = (base_chunk_size + 1) * thread_id
    chunk_size = base_chunk_size + 1
  else:
    chunk_offset = base_chunk_size * thread_id + remainder
    chunk_size = base_chunk_size

  block: body

template omp_parallel_chunks*(
length: Natural,
chunk_offset, chunk_size: untyped,
Expand Down Expand Up @@ -247,21 +307,11 @@ template omp_parallel_chunks*(
let `chunk_size`{.inject.} = length
block: body
else:
let
omp_size = length # make sure if length is computed it's only done once
max_threads = omp_get_max_threads()
omp_condition = omp_grain_size * max_threads < omp_size

{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = omp_size div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
else: ompsize - chunk_offset
block: body
let omp_size = length # make sure if length is computed it's only done once
let over_threshold = omp_grain_size * omp_get_max_threads() < omp_size

omp_parallel_if(over_threshold):
omp_chunks(omp_size, chunk_offset, chunk_size, body)

template omp_parallel_chunks_default*(
length: Natural,
Expand All @@ -282,19 +332,25 @@ template omp_parallel_chunks_default*(
body
)

template omp_parallel*(body: untyped): untyped =
## Starts an openMP parallel section
##
## Don't forget to use attachGC and detachGC if you are allocating
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
{.emit: "#pragma omp parallel".}
block: body

template omp_critical*(body: untyped): untyped =
  ## Marks `body` as an OpenMP critical section
  ## (`#pragma omp critical`): at most one thread executes it at a time.
  {.emit: "#pragma omp critical".}
  block: body

template omp_master*(body: untyped): untyped =
  ## Restricts `body` to the master thread of the current team
  ## (`#pragma omp master`); other threads skip it, with no implied barrier.
  {.emit: "#pragma omp master".}
  block: body

template omp_barrier*(): untyped =
  ## Emits an explicit synchronization barrier (`#pragma omp barrier`):
  ## every thread of the team waits here until all have arrived.
  {.emit: "#pragma omp barrier".}

import macros
macro omp_flush*(variables: varargs[untyped]): untyped =
  ## Emits `#pragma omp flush (v1,v2,...)` for the given variables,
  ## forcing their thread-local views to be made consistent with memory.
  ##
  ## Each identifier is rendered as a backtick-quoted name so the
  ## `{.emit.}` pragma substitutes the mangled C symbol.
  var listvars = "("
  for i, v in variables:
    if i > 0:
      listvars.add ','
    listvars.add '`'
    listvars.add $v
    listvars.add '`'
  listvars.add ')'
  result = quote do:
    {.emit: "#pragma omp flush " & `listvars`.}
139 changes: 54 additions & 85 deletions laser/strided_iteration/foreach.nim
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export omp_suffix # Pending https://github.com/nim-lang/Nim/issues/9365 or 9366

proc forEachContiguousImpl(
values, raw_ptrs, size, loopBody: NimNode,
use_openmp: static bool, omp_params: NimNode,
use_openmp: static bool,
): NimNode =
# Build the parallel body of a contiguous iterator

Expand All @@ -52,19 +52,9 @@ proc forEachContiguousImpl(
)

if use_openmp:
if omp_params.isNil:
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
let
omp_grain_size = omp_params[0]
use_simd = omp_params[1]
result = quote do:
omp_parallel_for(
`index`, `size`,
`omp_grain_size`, `use_simd`):
`body`
result = quote do:
omp_parallel_for_default(`index`, `size`):
`body`
else:
result = quote do:
for `index` in 0 ..< `size`:
Expand All @@ -74,8 +64,7 @@ proc forEachStridedImpl(
values, aliases,
raw_ptrs, size,
loopBody: NimNode,
use_openmp: static bool,
omp_params: NimNode,
use_openmp: static bool
): NimNode =
# Build the parallel body of a strided iterator

Expand Down Expand Up @@ -110,14 +99,10 @@ proc forEachStridedImpl(
let body = loopBody.replaceNodes(replacements = elems_strided, to_replace = values)
let stridedBody = stridedBodyTemplate()

let omp_grain_size = newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
if use_openmp:
let
omp_grain_size = if omp_params.isNil: newLit( # scale grain_size down for strided operation
OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR
) else: newLit(
omp_params[0].intVal div OMP_NON_CONTIGUOUS_SCALE_FACTOR
)
use_simd = if omp_params.isNil: newLit true else: omp_params[1]
result = quote do:
omp_parallel_chunks(
`size`, `chunk_offset`, `chunk_size`,
Expand All @@ -126,56 +111,30 @@ proc forEachStridedImpl(
else:
result = stridedBody

template forEachContiguousTemplate(use_openmp: static bool){.dirty.} =
template forEachSimpleTemplate(contiguous, use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let alias0 = aliases[0]

result = quote do:
block:
`aliases_stmt`
`test_shapes`
`raw_ptrs_stmt`
let `size` = `alias0`.size
`body`

template forEachStridedTemplate(use_openmp: static bool){.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode
params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
)
let body = if contiguous:
forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp
)
else:
forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]

result = quote do:
Expand All @@ -190,24 +149,23 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =
var
params, loopBody, values, aliases, raw_ptrs: NimNode
aliases_stmt, raw_ptrs_stmt, test_shapes: NimNode
omp_params: NimNode

params = args
loopBody = params.pop()

initForEach(
args,
params,
loopBody,
omp_params,
values, aliases, raw_ptrs,
aliases_stmt, raw_ptrs_stmt,
test_shapes
)

let size = genSym(nskLet, "size_")
let contiguous_body = forEachContiguousImpl(
values, raw_ptrs, size, loopBody, use_openmp, omp_params
values, raw_ptrs, size, loopBody, use_openmp
)
let strided_body = forEachStridedImpl(
values, aliases, raw_ptrs, size, loopBody, use_openmp, omp_params
values, aliases, raw_ptrs, size, loopBody, use_openmp
)
let alias0 = aliases[0]
var test_C_Contiguous = newCall(ident"is_C_contiguous", alias0)
Expand All @@ -232,52 +190,63 @@ template forEachTemplate(use_openmp: static bool) {.dirty.} =

macro forEachContiguous*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguous x in a, y in b, z in c, (1024, true):
## forEachContiguous x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
forEachContiguousTemplate(true)
##
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Use ``forEachStaged`` to fine-tune those defaults.
forEachSimpleTemplate(contiguous = true, use_openmp = true)

macro forEachContiguousSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachContiguousSerial x in a, y in b, z in c:
## x += y * z
## OpenMP parameters will be ignored
forEachContiguousTemplate(false)
forEachSimpleTemplate(contiguous = true, use_openmp = false)

macro forEachStrided*(args: varargs[untyped]): untyped =
## Format:
## forEachStrided x in a, y in b, z in c, (1024, true):
## forEachStrided x in a, y in b, z in c:
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The OpenMP minimal per-core grain size
## is always scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
forEachStridedTemplate(true)
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachSimpleTemplate(contiguous = false, use_openmp = true)

macro forEachStridedSerial*(args: varargs[untyped]): untyped =
## Format:
## forEachStridedSerial x in a, y in b, z in c:
## x += y * z
##
## Strided iteration with serial execution. OpenMP params passed to it will be ignored
forEachStridedTemplate(false)
forEachSimpleTemplate(contiguous = false, use_openmp = false)

macro forEach*(args: varargs[untyped]): untyped =
## Format:
## forEach x in a, y in b, z in c, (1024, true):
## x += y * z
## (1024, true) corresponds to omp_grain_size, use_simd
## from omp_parallel_for
##
## The iteration strategy is selected at runtime depending of
## the tensors memory layout. If you know at compile-time that the tensors are
## contiguous or strided, use forEachContiguous or forEachStrided instead.
## Runtime selection requires duplicating the code body.
##
## If the tensors are non-contiguous, the OpenMP minimal per-core grain size
## is scaled down by OMP_NON_CONTIGUOUS_SCALE_FACTOR (4 by default)
## In the contiguous case:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE = 1024 elementwise operations to process per cores.
##
## Compiler will also be hinted to unroll loop for SIMD vectorization.
##
## Otherwise if tensor is strided:
## The threshold for parallelization by default is
## OMP_MEMORY_BOUND_GRAIN_SIZE div OMP_NON_CONTIGUOUS_SCALE_FACTOR =
## 1024/4 = 256 elementwise operations to process per cores.
##
## Use ``forEachStaged`` to fine-tune this default.
forEachTemplate(true)

macro forEachSerial*(args: varargs[untyped]): untyped =
Expand Down