Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* OpenCL 🔥 initial commit 🎆 * 🔥 Brace yourself ❄️ OpenCL ❄️ is coming
- Loading branch information
Showing
16 changed files
with
590 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
# Copyright 2017 the Arraymancer contributors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import ../data_structure, | ||
./opencl_global_state, | ||
./global_config, | ||
./metadataArray, | ||
nimcl, opencl, macros | ||
|
||
export nimcl, opencl, opencl_global_state | ||
|
||
|
||
# Data structures to ease interfacing with OpenCL and kernels | ||
|
||
func toClpointer*[T](p: ptr T|ptr UncheckedArray[T]): PMem =
  ## Reinterpret a raw device pointer (plain or unchecked-array) as an
  ## OpenCL memory handle (`PMem`) so it can be passed to the OpenCL C API.
  result = cast[PMem](p)
|
||
proc clMalloc*[T](size: Natural): ptr UncheckedArray[T] {.inline.} =
  ## Internal proc.
  ## Wraps nimcl's `buffer` (OpenCL createBuffer): allocates room for
  ## `size` elements of `T` on the device tied to the global `clContext0`,
  ## returned as an unchecked-array pointer.
  let device_buf = buffer[T](clContext0, size)
  cast[ptr UncheckedArray[T]](device_buf)
|
||
proc deallocCl*[T](p: ref[ptr UncheckedArray[T]]) {.noSideEffect.} =
  ## Finalizer for device allocations: releases the underlying OpenCL
  ## buffer, unless the tracked pointer is nil (never allocated / already freed).
  if p[].isNil:
    return
  check releaseMemObject(p[].toClpointer)
|
||
# ############################################################## | ||
# # Base ClStorage type | ||
|
||
proc newClStorage*[T: SomeReal](length: int): ClStorage[T] =
  ## Allocates device memory for `length` elements of `T` and wraps it in a
  ## ClStorage whose ref-tracking field carries the `deallocCl` finalizer,
  ## so the OpenCL buffer is released when the storage is collected.
  result.Flen = length
  new(result.Fref_tracking, deallocCl)
  result.Fdata = clMalloc[T](length)
  result.Fref_tracking[] = result.Fdata
|
||
# ######################################################### | ||
# # Sending tensor layout to OpenCL Kernel | ||
|
||
type
  ClLayoutArray = ref[ptr UncheckedArray[cint]]
    ## Reference to an array on the device
    # TODO: finalizer
    # or replace by a distinct type with a destructor

  ClTensorLayout[T: SomeReal] = object
    ## Mimicks CudaTensor
    ## Metadata stored on GPU or Accelerators

    rank*: cint               # Number of dimension of the tensor
    shape*: ClLayoutArray     # Device-side array of `rank` extents
    strides*: ClLayoutArray   # Device-side array of `rank` strides (in elements)
    offset*: cint             # Element offset of the first item in `data`
    data*: ptr T              # Data on OpenCL device
    len*: cint                # Number of elements allocated in memory
|
||
proc layoutOnDevice*[T:SomeReal](t: ClTensor[T]): ClTensorLayout[T] =
  ## Store a ClTensor shape, strides, etc information on the GPU
  #
  # TODO: instead of storing pointers to shape/stride/etc that are passed to each kernel
  # pass the layout object directly and call it with layout->shape, layout->rank

  result.rank = t.rank.cint

  result.offset = t.offset.cint
  result.data = t.get_data_ptr
  result.len = t.size.cint

  # Attach the deallocCl finalizer so the device-side metadata buffers
  # are released when the layout object is collected.
  new result.shape, deallocCl
  new result.strides, deallocCl

  # Fixed-capacity allocation: only the first `rank` entries are meaningful.
  result.shape[] = clMalloc[cint](MAXRANK)
  result.strides[] = clMalloc[cint](MAXRANK)

  # Stage the metadata in host-side arrays before the device transfers below.
  var
    tmp_shape: array[MAXRANK, cint] # ClLayoutArray
    tmp_strides: array[MAXRANK, cint] # ClLayoutArray

  for i in 0..<t.rank:
    tmp_shape[i] = t.shape[i].cint
    tmp_strides[i] = t.strides[i].cint

  # TODO: use streams and async
  # Only the first `rank` entries are copied, not the full MAXRANK capacity.
  let size = t.rank * sizeof(cint)
  check enqueueWriteBuffer(
    clQueue0,
    result.shape[].toClpointer,
    CL_false, # Non-blocking copy
    0,
    size,
    addr tmp_shape[0],
    0, nil, nil
  )

  check enqueueReadBuffer = check enqueueWriteBuffer(
    clQueue0,
    result.strides[].toClpointer,
    CL_true, # Blocking copy, we don't want tmp_strides (and tmp_shape) to disappear while copy is pending
              # NOTE(review): relies on clQueue0 being an in-order queue so the
              # blocking second write also fences the first — TODO confirm
    0,
    size,
    addr tmp_strides[0],
    0, nil, nil
  )
|
||
|
||
# ######################################################### | ||
# # Variadic number of args, to remove after https://github.com/unicredit/nimcl/pull/1 | ||
|
||
#### Taken from nimcl
# `setArg` overloads bind one kernel argument at position `index`;
# the right overload is picked at compile time from the argument's type.

template setArg(kernel: PKernel, item: PMem, index: int) =
  # Device buffer: clSetKernelArg needs the address of the cl_mem handle,
  # so copy it to a local to make it addressable.
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(Pmem), addr x)

template setArg[A](kernel: PKernel, item: var A, index: int) =
  # Mutable value: already an l-value, pass its address directly.
  check setKernelArg(kernel, index.uint32, sizeof(A), addr item)

template setArg[A](kernel: PKernel, item: LocalBuffer[A], index: int) =
  # Local (work-group) memory: only a byte size is supplied, pointer is nil.
  check setKernelArg(kernel, index.uint32, int(item) * sizeof(A), nil)

template setArg(kernel: PKernel, item: SomeInteger, index: int) =
  # Integer value (possibly a literal): copy to a local so it is addressable.
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(type(item)), addr x)
####
|
||
macro args*(kernel: Pkernel, args: varargs[untyped]): untyped =
  ## Expands to one `kernel.setArg(arg, position)` statement per argument,
  ## binding each argument to the kernel in the order given.
  result = newStmtList()

  for position in 0 ..< args.len:
    let argument = args[position]
    result.add quote do:
      `kernel`.setArg(`argument`, `position`)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright 2017 the Arraymancer contributors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import nimcl, opencl | ||
|
||
# ###################################################
# Global OpenCL state
|
||
# Type-bound `=destroy` on a typeclass needs the experimental features switch.
{.experimental.}

# Any OpenCL handle that must be released when it goes out of scope.
type clResource = PCommandQueue | PKernel | PProgram | PMem | PContext

proc `=destroy`*(clres: clResource) =
  # Hand the handle back to OpenCL (nimcl's `release` dispatches to the
  # matching clRelease* call for the concrete handle type).
  release clres

# TODO detect and use accelerators (FPGAs) or GPU by default
# And allow switching OpenCL device.
# Global defaults: first OpenCL device with its context and command queue.
let (clDevice0*, clContext0*, clQueue0*) = singleDeviceDefaults()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# Copyright 2017 the Arraymancer contributors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import ../private/sequninit, | ||
./private/p_init_opencl, | ||
./backend/opencl_backend, | ||
./data_structure, | ||
./init_cpu | ||
|
||
proc opencl*[T:SomeReal](t: Tensor[T]): ClTensor[T] {.noInit.} =
  ## Convert a tensor on Cpu to a tensor on an OpenCL device.

  result = newClTensor[T](t.shape)

  # Work from a forced row-major contiguous view so one linear copy suffices.
  let host_src = t.asContiguous(rowMajor, force = true)
  let byte_size = csize(result.size * sizeof(T))

  check enqueueWriteBuffer(
    clQueue0,
    result.get_data_ptr.toClpointer,
    CL_true, # Blocking copy: `host_src` is a local and must outlive the transfer
    0,
    byte_size,
    host_src.get_data_ptr.toClpointer,
    0, nil, nil
  )
|
||
proc cpu*[T:SomeReal](t: ClTensor[T]): Tensor[T] {.noInit.} =
  ## Convert a tensor on an OpenCL device to a tensor on Cpu.
  # Blocking copy so all data is available for any subsequent computation.

  # Metadata is carried over unchanged; the whole allocated device buffer is
  # copied back without a prior asContiguous.
  result.shape = t.shape
  result.strides = t.strides
  result.offset = t.offset
  # Uninitialized alloc is safe: the read below overwrites every element.
  result.data = newSeqUninit[T](t.storage.Flen)

  let byte_size = t.storage.Flen * sizeof(T)

  check enqueueReadBuffer(
    clQueue0,
    t.get_data_ptr.toClpointer,
    CL_true, # Blocking copy, we don't want computation to continue while copy is still pending
    0,
    byte_size,
    result.get_data_ptr.toClpointer,
    0, nil, nil
  )
|
||
|
||
|
||
proc zeros_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor filled with 0 with the same shape as the input
  ## Input:
  ##      - A ClTensor
  ## Result:
  ##      - A zero-ed ClTensor of the same shape

  # TODO use clEnqueueFillBuffer (OpenCL 1.2 only)
  # Currently zero-fills on CPU then copies to the device.
  result = zeros[T](t.shape).opencl
|
||
proc ones_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor filled with 1 with the same shape as the input
  ## Input:
  ##      - A ClTensor
  ## Result:
  ##      - A one-ed ClTensor of the same shape
  # Fills on CPU then copies to the device (see zeros_like's TODO).
  result = ones[T](t.shape).opencl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Copyright 2017 the Arraymancer contributors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import ./backend/opencl_backend, | ||
./private/p_kernels_interface_opencl, | ||
./private/p_init_opencl, | ||
./private/p_checks, | ||
./data_structure | ||
|
||
# Kernels are created just-in-time and incur some overhead | ||
# unfortunately doing otherwise would require | ||
# prebuilding binaries for each AMD, Nvidia, Intel, Qualcomm, ... OpenCL SDK and drivers | ||
# Nvidia automatically caches OpenCL JIT compilation. | ||
# For maximum performance you might need a similar scheme for your platform. | ||
|
||
# Generates an element-wise `+` for ClTensor[T]; `ctype` is the OpenCL C
# scalar type name ("float"/"double") spliced into the kernel source.
template genAdd(T: typedesc, ctype: string): untyped =
  proc `+`*(a,b: ClTensor[T]): ClTensor[T] {.noInit.}=
    ## ClTensor addition

    when compileOption("boundChecks"):
      check_elementwise(a,b)

    result = newClTensor[T](a.shape)

    # Kernel is built just-in-time on every call (see the note at the top of
    # this file); some drivers cache the compilation, others may not.
    let
      ocl_addKernel = gen_ocl_apply3("AddKernel", ctype, "+")
      program = clContext0.createAndBuild(ocl_addKernel, clDevice0)
      opencl_add = program.createKernel("AddKernel")

      # Device-side layouts: rank/shape/strides/offset plus the data pointer.
      dst = layoutOnDevice result
      src_a = layoutOnDevice a
      src_b = layoutOnDevice b

    # Bind arguments in the exact order the generated kernel declares them.
    opencl_add.args(dst.rank, dst.len,
                    dst.shape[], dst.strides[], dst.offset, dst.data.toClpointer,
                    src_a.shape[], src_a.strides[], src_a.offset, src_a.data.toClpointer,
                    src_b.shape[], src_b.strides[], src_b.offset, src_b.data.toClpointer
                    )

    clQueue0.run(opencl_add, result.size)

genAdd(float32, "float")
genAdd(float64, "double") # NOTE(review): assumes the device supports fp64 (cl_khr_fp64) — confirm
Oops, something went wrong.