-
Notifications
You must be signed in to change notification settings - Fork 95
/
openmp.nim
157 lines (126 loc) · 5.7 KB
/
openmp.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ./global_config,
./memory_optimization_hints
when defined(openmp):
  # Compiled with the `openmp` symbol defined: add the OpenMP compiler/linker
  # flags and declare the thread-management routines from the C header omp.h.
  when not defined(cuda): # For cuda, OpenMP flags must be passed
    {.passC: "-fopenmp".} # behind -Xcompiler -fopenmp
    {.passL: "-fopenmp".}

  # User-defined pragma: `{.omp.}` is shorthand for `header: "omp.h"`.
  {.pragma: omp, header:"omp.h".}

  proc omp_set_num_threads*(x: cint) {.omp.}
  proc omp_get_num_threads*(): cint {.omp.}
  proc omp_get_max_threads*(): cint {.omp.}
  proc omp_get_thread_num*(): cint {.omp.}

else:
  # Without OpenMP the same names stay available as stand-ins: setting the
  # thread count does nothing, the thread counts are 1 and the thread id is 0.
  template omp_set_num_threads*(x: cint) = discard
  template omp_get_num_threads*(): cint = 1
  template omp_get_max_threads*(): cint = 1
  template omp_get_thread_num*(): cint = 0
# Annotation string passed as the third argument of the `||` iterator by
# omp_parallel_countup and omp_parallel_forup.
# The text refers to a variable literally named `ompsize`, so a template using
# this annotation has to bind its bound to a local with exactly that name.
# OMP_FOR_THRESHOLD is not defined in this file (presumably it comes from
# ./global_config - confirm).
const OMP_FOR_ANNOTATION = "simd if(ompsize > " & $OMP_FOR_THRESHOLD & ")"
# Runs `body` with `i` iterating from 0 up to `size` through the `||` iterator,
# annotated with OMP_FOR_ANNOTATION.
#   i    - identifier used as the loop variable inside `body`
#   size - upper bound of the loop; the expression is evaluated once
#   body - statements executed for every `i`
# NOTE(review): `||` appears to include its upper bound (the block templates in
# this file call it with `num_blocks-1` to visit `num_blocks` blocks), so
# `size` would be the last index rather than an element count - confirm
# against callers.
template omp_parallel_countup*(i: untyped, size: Natural, body: untyped): untyped =
  # The local must be called `ompsize`: OMP_FOR_ANNOTATION mentions it by name.
  let ompsize = size # ensure that if size is computed it is only called once
  for i in `||`(0, ompsize, OMP_FOR_ANNOTATION):
    body
# Runs `body` with `i` iterating from `start` up to `size` through the `||`
# iterator, annotated with OMP_FOR_ANNOTATION.
#   i     - identifier used as the loop variable inside `body`
#   start - first value of `i`
#   size  - upper bound of the loop (not a length relative to `start`); the
#           expression is evaluated once
#   body  - statements executed for every `i`
# NOTE(review): `||` appears to include its upper bound (the block templates in
# this file call it with `num_blocks-1` to visit `num_blocks` blocks) -
# confirm against callers.
template omp_parallel_forup*(i: untyped, start, size: Natural, body: untyped): untyped =
  # The local must be called `ompsize`: OMP_FOR_ANNOTATION mentions it by name.
  let ompsize = size # ensure that if size is computed it is only called once
  for i in `||`(start, ompsize, OMP_FOR_ANNOTATION):
    body
# Splits `size` elements into contiguous blocks and runs `body` once per block.
#   block_offset - identifier declared by this template; inside `body` it holds
#                  the index of the first element of the current block
#   block_size   - identifier declared by this template; inside `body` it holds
#                  the number of elements of the current block
#   size         - total number of elements; the expression is evaluated once
#   body         - statements executed for every block
# Nothing is executed when `size` is 0.
# When compiled with OpenMP, `size` is at least OMP_FOR_THRESHOLD and more than
# one block is available, the blocks are visited through the `||` iterator.
# In every other case `body` runs exactly once over the whole range.
template omp_parallel_blocks*(block_offset, block_size: untyped, size: Natural, body: untyped): untyped =
  let ompsize = size # ensure that if size is computed it is only called once

  if likely(ompsize > 0):
    # Named block so that the multi-block path can skip the single-block fallback.
    block ompblocks:
      when defined(openmp):
        if ompsize >= OMP_FOR_THRESHOLD:
          # At most one block per thread, and never more blocks than elements.
          let num_blocks = min(omp_get_max_threads(), ompsize)
          if num_blocks > 1:
            let bsize = ompsize div num_blocks
            for block_index in `||`(0, num_blocks-1, "simd"):
              # block_offset and block_size are injected into the calling proc
              let block_offset = bsize*block_index
              # The last block also takes the remainder of the integer division.
              let block_size = if block_index < num_blocks-1: bsize else: ompsize - block_offset
              block:
                body
            # All blocks were handled above: do not run the fallback.
            break ompblocks

      # Fallback: a single block spanning all elements.
      # block_offset and block_size are injected into the calling proc
      let block_offset = 0
      let block_size = ompsize
      block:
        body
# Reduces `size` elements into `reduced`, block by block when OpenMP is enabled.
#   reduced      - destination of the final result
#   block_offset - identifier declared by this template for the op_* snippets:
#                  index of the first element the snippet has to process
#   block_size   - identifier declared by this template: number of elements the
#                  snippet has to process (see the note on op_init below)
#   size         - total number of elements; the expression is evaluated once
#   weight       - `size * weight` is compared with OMP_FOR_THRESHOLD to decide
#                  whether the multi-block path is worth taking
#   op_init      - run once per block with `x` bound to that block's
#                  accumulator and `block_offset` at the block's first element
#   op_middle    - run once per block with `x` bound to the same accumulator,
#                  `block_offset` advanced by one and `block_size` reduced by
#                  one, i.e. over the rest of the block
#   op_final     - multi-block path only: run for every block after the first
#                  with `x` bound to `reduced` and `y` holding that block's
#                  accumulator
# Nothing is executed when `size` is 0.
# NOTE: in the sequential fallback `block_size` is not declared while op_init
# runs; only `block_offset` is.
template omp_parallel_reduce_blocks*[T](reduced: T, block_offset, block_size: untyped, size, weight: Natural, op_final, op_init, op_middle: untyped): untyped =
  # To prevent false sharing, results will be stored in an array but
  # padded to be at least a cache line apart.
  # Assuming a 64B cache line, 16 float32/int32 or 8 float64/int64 fit in one.
  # TODO compile time evaluation depending on sizeof(T)
  # Pending https://github.com/nim-lang/Nim/pull/5664
  const maxItemsPerCacheLine = 16

  let ompsize = size # ensure that if size is computed it is only called once

  if likely(ompsize > 0):
    # Named block so that the multi-block path can skip the sequential fallback.
    block ompblocks:
      when defined(openmp):
        if ompsize * weight >= OMP_FOR_THRESHOLD:
          # Never more blocks than elements, threads or OMP_MAX_REDUCE_BLOCKS.
          let num_blocks = min(min(ompsize, omp_get_max_threads()), OMP_MAX_REDUCE_BLOCKS)
          if num_blocks > 1:
            # Not defined in this file; presumably provides the `align64`
            # pragma used on the next line - confirm in
            # ./memory_optimization_hints.
            withMemoryOptimHints()
            # One accumulator per block, stored maxItemsPerCacheLine slots apart.
            var results{.align64, noInit.}: array[OMP_MAX_REDUCE_BLOCKS * maxItemsPerCacheLine, type(reduced)]
            let bsize = ompsize div num_blocks

            # Blocks of a single element are left to the sequential fallback.
            if bsize > 1:
              # Initialize first elements
              for block_index in 0..<num_blocks:
                # block_offset and block_size are injected into the calling proc
                let block_offset = bsize*block_index
                # The last block also takes the remainder of the integer division.
                let block_size = if block_index < num_blocks-1: bsize else: ompsize - block_offset
                # Inject x using a template to be able to mutate it
                template x(): untyped =
                  results[block_index * maxItemsPerCacheLine]
                block:
                  op_init

              # Reduce blocks
              for block_index in `||`(0, num_blocks-1, "simd"):
                # block_offset and block_size are injected into the calling proc
                var block_offset = bsize*block_index
                # The first element of the block was consumed by op_init:
                # one element less to process, starting one element further.
                let block_size = (if block_index < num_blocks-1: bsize else: ompsize - block_offset) - 1
                block_offset += 1
                # Inject x using a template to be able to mutate it
                template x(): untyped =
                  results[block_index * maxItemsPerCacheLine]
                block:
                  op_middle

              # Finally reduce results from openmp
              block:
                # Start from the first block's accumulator, then fold in the others.
                shallowCopy(reduced, results[0])
                # Inject x using a template to be able to mutate it
                template x(): untyped =
                  reduced
                for block_index in 1..<num_blocks:
                  let y {.inject.} = results[block_index * maxItemsPerCacheLine]
                  op_final

              # The reduction is complete: do not run the fallback.
              break ompblocks

      # Fallback normal sequential reduce
      block:
        # Initialize first elements
        var block_offset = 0
        # Own scope so that `x` can be declared again further down.
        block:
          template x(): untyped =
            reduced
          block:
            op_init

        # Offset to reduce rest of elements
        block_offset = 1
        let block_size = ompsize-1
        # Nothing is left to reduce when there is only one element.
        if block_size > 0:
          # Inject x using a template to be able to mutate it
          template x(): untyped =
            reduced
          block:
            op_middle