# Laser
# Copyright (c) 2018 Mamy André-Ratsimbazafy
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.

import
  ../openmp,
  ../compiler_optim_hints,
  ../strided_iteration/foreach,
  ../dynamic_stack_arrays,
  ../private/nested_containers,
  ./datatypes,
  ../../std_version_types,
  # Standard library
  typetraits, sequtils,
  # Third-party
  nimblas

## Initialization and copy routines

func toMetadata*(s: varargs[int]): Metadata =
  result.len = s.len
  for i in 0 ..< s.len:
    result.data[i] = s[i]

template toMetadata*(m: Metadata): Metadata = m
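
# Illustrative sketch (comment only, not library code): `toMetadata` packs a
# shape into the fixed-capacity `Metadata` container:
#   let m = toMetadata(2, 3, 4)
#   assert m.len == 3                  # three dimensions
#   assert m[0] == 2 and m[2] == 4     # per-dimension extents
# The `len`/`[]` accessors are assumed to come from ../dynamic_stack_arrays.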

template initTensorMetadataImpl(
    result: var Tensor,
    size: var int, shape: openarray[int]|Metadata,
    layout: static OrderType) =
  ## We don't use a proc directly due to https://github.com/nim-lang/Nim/issues/6529
  result.shape = shape.toMetadata
  mixin rank
  result.strides.len = result.rank

  size = 1
  when layout == rowMajor:
    for i in countdown(shape.len - 1, 0):
      result.strides[i] = size
      size *= shape[i]
  elif layout == colMajor:
    for i in 0 ..< shape.len:
      result.strides[i] = size
      size *= shape[i]
  else:
    {.error: "Unreachable, unknown layout".}

func initTensorMetadata*(
       result: var Tensor,
       size: var int, shape: openarray[int],
       layout: static OrderType = rowMajor) =
  ## result metadata and size will be initialized in-place
  initTensorMetadataImpl(result, size, shape, layout)

func initTensorMetadata*(
       result: var Tensor,
       size: var int, shape: Metadata,
       layout: static OrderType = rowMajor) =
  ## result metadata and size will be initialized in-place
  initTensorMetadataImpl(result, size, shape, layout)
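
# Worked example (comment only): for a row-major tensor of shape [2, 3, 4]
# the loop above walks the dimensions right to left, giving
#   strides = [12, 4, 1]  and  size = 24,
# i.e. strides[i] is how many elements are skipped when index i increments.
# The column-major layout would instead give strides = [1, 2, 6].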

proc deepCopy*[T](dst: var Tensor[T], src: Tensor[T]) =
  ## Performs a deep copy of ``src`` into ``dst``.
  ## Deepcopy is recursive, including for ref types and custom types
  ## that implement deepCopy.
  ##
  ## Note that if ``dst`` was already initialized with a ``storage``,
  ## that storage will be detached from ``dst``. This does not write
  ## into existing storage.
  var size: int
  initTensorMetadata(dst, size, src.shape)
  dst.storage = allocCpuStorage(T, size)

  when T is KnownSupportsCopyMem:
    # We use memcpy; due to SIMD optimizations in memcpy,
    # we require higher parallelization thresholds
    if src.is_C_contiguous:
      omp_parallel_chunks(
            size, chunk_offset, chunk_size,
            OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
        copyMem(
          dst.unsafe_raw_offset[chunk_offset].addr,
          src.unsafe_raw_offset[chunk_offset].unsafeAddr,
          chunk_size * sizeof(T)
        )
    else:
      forEachStrided d in dst, s in src:
        d = s
  else:
    # If the type doesn't support memcpy,
    # we assume we can't use OpenMP and we need
    # a recursive deepCopy
    forEachSerial d in dst, s in src:
      deepCopy(d, s) # recursive deepcopy
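
# Minimal usage sketch (comment only; assumes `src` is a Tensor[float32]
# created elsewhere, e.g. with `toTensor`):
#   var dst: Tensor[float32]
#   dst.deepCopy(src)      # dst gets fresh storage with src's shape and values
#   # mutating dst afterwards leaves src untouched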

proc copyFrom*[T](dst: var Tensor[T], src: Tensor[T]) =
  ## Copy the source tensor into the destination tensor.
  ## Both should have the same shape. If the destination tensor is a view,
  ## only the data exposed by the view is modified.
  ##
  ## This is useful to update subslices of an existing tensor.
  ##
  ## ⚠️ Warning:
  ##   The data exposed by the destination tensor will be overwritten.
  ##   If the destination tensor is a view, all views of that data will be changed.
  ##   They however conserve their shape and strides.
  ##
  ## Note: The copy is not recursive.
  when T is KnownSupportsCopyMem:
    # We use memcpy; due to SIMD optimizations in memcpy,
    # we require higher parallelization thresholds
    if src.is_C_contiguous:
      assert dst.shape == src.shape
      omp_parallel_chunks(
            src.size, chunk_offset, chunk_size,
            OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
        copyMem(
          dst.unsafe_raw_offset[chunk_offset].addr,
          src.unsafe_raw_offset[chunk_offset].unsafeAddr,
          chunk_size * sizeof(T)
        )
    else:
      forEachStrided d in dst, s in src:
        d = s
  else:
    # If the type doesn't support memcpy,
    # we assume we can't use OpenMP
    forEachSerial d in dst, s in src:
      d = s # non-recursive copy
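
# Minimal usage sketch (comment only):
#   var a = newTensor[int](2, 3)                   # zero-initialized 2x3
#   let b = toTensor(@[@[1, 2, 3], @[4, 5, 6]])
#   a.copyFrom(b)           # overwrites a's data in place, reusing a's storage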

proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
  ## Copy data from the buffer into the destination tensor.
  ## Destination tensor size and buffer length should be the same.
  when T is KnownSupportsCopyMem:
    withCompilerOptimHints()
    mixin size
    # Either this `mixin` or `from ../../tensor/data_structure import size`
    # is needed, as can be seen with `type tmp = typeof(Tensor[int].default.size)`,
    # which would fail at top-level.
    doAssert dst.size == len, "Tensor size and buffer length should be the same"
    let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
    omp_parallel_chunks(
          len, chunk_offset, chunk_size,
          OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
      copyMem(
        dst.unsafe_raw_offset[chunk_offset].addr,
        buf[chunk_offset].unsafeAddr,
        chunk_size * sizeof(T)
      )
  else:
    {.fatal: "Only non-ref types and types with trivial destructors can be raw copied.".}
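
# Minimal usage sketch (comment only): copying from a stack array whose
# length matches the tensor's size:
#   var t = newTensor[float32](2, 2)
#   var buf = [1'f32, 2, 3, 4]
#   t.copyFromRaw(buf[0].addr, buf.len)   # t now holds [[1, 2], [3, 4]]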

proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
  ## Reset/initialize the tensor data to binary zero.
  ## The tensor metadata is not touched.
  ## The input tensor must be contiguous. For seq-based Tensors the underlying
  ## sequence will be reset and set back to the tensor's size.
  ##
  ## ⚠️ Warning:
  ##   The data of the input tensor will be overwritten.
  ##   If the input tensor is a view, all views of that data will be changed.
  ##   They however conserve their shape and strides.
  when check_contiguous:
    if unlikely(not t.is_C_contiguous):
      # TODO: error model - https://github.com/numforge/laser/issues/2
      # + If using exceptions, display the tensor ident with astToStr
      raise newException(ValueError, "Input tensor is not contiguous.")

  when not (T is KnownSupportsCopyMem):
    t.storage.raw_buffer.reset()
    t.storage.raw_buffer.setLen(t.size)
  else:
    mixin size
    omp_parallel_chunks(
          t.size, chunk_offset, chunk_size,
          OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
      zeroMem(
        t.unsafe_raw_offset[chunk_offset].addr,
        chunk_size * sizeof(T)
      )
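
# Minimal usage sketch (comment only):
#   var t = toTensor(@[1.0, 2.0, 3.0])
#   t.setZero()             # data becomes [0.0, 0.0, 0.0]; shape is unchanged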

proc newTensor*[T](shape: varargs[int]): Tensor[T] =
  var size: int
  initTensorMetadata(result, size, shape)
  allocCpuStorage(result.storage, size)
  when T is KnownSupportsCopyMem:
    # seq-based tensors are zero'ed by default upon construction
    setZero(result, check_contiguous = false)

proc newTensor*[T](shape: Metadata): Tensor[T] =
  var size: int
  initTensorMetadata(result, size, shape)
  allocCpuStorage(result.storage, size)
  when T is KnownSupportsCopyMem:
    # seq-based tensors are zero'ed by default upon construction
    setZero(result, check_contiguous = false)
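
# Minimal usage sketch (comment only):
#   let a = newTensor[float64](3, 4)            # 3x4, zero-initialized
#   let b = newTensor[int](toMetadata(2, 2))    # same, via the Metadata overload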

proc toTensor*(a: openarray, dummy_bugfix: static[int] = 0): auto =
  ## Convert an openarray to a Tensor
  ## Input:
  ##      - An array or a seq (can be nested)
  ## Result:
  ##      - A Tensor of the same shape
  ##
  ## Note: the `dummy_bugfix` parameter is unused and is a workaround for a Nim bug.
  # TODO: remove `dummy_bugfix` - https://github.com/nim-lang/Nim/issues/6343
  let
    shape = getShape(a)
    data = toSeq(flatIter(a))

  if unlikely(shape.product != data.len):
    raise newException(
      IndexDefect,
      "Each nested sequence at the same level" &
        " must have the same number of elements"
    )

  type T = typeof(data[0])
  var
    t: Tensor[T]
    size: int

  initTensorMetadata(t, size, shape)
  allocCpuStorage(t.storage, size)

  when T is KnownSupportsCopyMem:
    t.copyFromRaw(data[0].unsafeAddr, data.len)
  else:
    shallowCopy(t.storage.raw_buffer, data)

  result = t
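
# Minimal usage sketch (comment only): nested seqs map to dimensions.
#   let m = toTensor(@[@[1, 2, 3],
#                      @[4, 5, 6]])    # shape [2, 3], row-major
# Ragged nesting such as @[@[1, 2], @[3]] raises IndexDefect.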

proc fromBuffer*[T](rawBuffer: ptr UncheckedArray[T], shape: varargs[int]): Tensor[T] =
  ## Creates a `Tensor[T]` from a raw buffer, cast as `ptr UncheckedArray[T]`. The
  ## size derived from the given shape must match the size of the buffer!
  ##
  ## If you type-cast a raw `pointer` to `ptr UncheckedArray[T]` before handing it to this
  ## proc, make sure to cast to the correct type, as we cannot check the validity of
  ## the type!
  ##
  ## Its counterpart ``toUnsafeView`` can be used to obtain a ``ptr UncheckedArray`` from a Tensor.
  var size: int
  initTensorMetadata(result, size, shape)
  cpuStorageFromBuffer(result.storage, rawBuffer, size)

proc fromBuffer*[T](rawBuffer: pointer, shape: varargs[int]): Tensor[T] =
  ## Creates a `Tensor[T]` from a raw `pointer`. Make sure that the explicit type
  ## given to this proc actually matches the data stored behind the pointer!
  ## The size derived from the given shape must match the size of the buffer!
  ##
  ## Its counterpart ``toUnsafeView`` can be used to obtain a ``ptr UncheckedArray`` from a Tensor.
  var size: int
  initTensorMetadata(result, size, shape)
  cpuStorageFromBuffer(result.storage, rawBuffer, size)
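
# Minimal usage sketch (comment only): wrapping externally owned memory.
#   var raw = cast[ptr UncheckedArray[int32]](alloc0(6 * sizeof(int32)))
#   let t = fromBuffer(raw, 2, 3)  # no copy: t views the buffer as a 2x3 tensor
#   # The caller is assumed to stay responsible for the buffer's lifetime.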

func toUnsafeView*[T: KnownSupportsCopyMem](t: Tensor[T], aligned: static bool = true): ptr UncheckedArray[T] {.inline.} =
  ## Returns an unsafe view of the valid data as a ``ptr UncheckedArray``.
  ## Its counterpart ``fromBuffer`` can be used to create a Tensor from a ``ptr UncheckedArray``.
  ##
  ## Unsafe: the pointer can outlive the input tensor.
  unsafe_raw_offset(t, aligned).distinctBase()
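
# Minimal usage sketch (comment only): round-tripping through a raw view.
#   let t = toTensor(@[1'i32, 2, 3, 4])
#   let p = t.toUnsafeView()    # ptr UncheckedArray[int32] into t's data
#   let u = fromBuffer(p, 4)    # zero-copy re-wrap; u aliases t's data
#   # p must not be used after t's storage has been freed.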