🔥 OpenCL tensors ⚡ (#184). Closes #69
* OpenCL 🔥 initial commit 🎆

* 🔥 Brace yourself ❄️ OpenCL ❄️ is coming
mratsim committed Jan 14, 2018
1 parent fc4ad52 commit b061aef
Showing 16 changed files with 590 additions and 15 deletions.
6 changes: 5 additions & 1 deletion arraymancer.nimble
@@ -5,7 +5,7 @@ description = "A n-dimensional tensor (ndarray) library"
license = "Apache License 2.0"

### Dependencies
requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4"
requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4", "nimcl >= 0.1.1"

## Install files
srcDir = "src"
@@ -124,6 +124,10 @@ task test_cuda, "Run all tests - Cuda backend with CUBLAS and CuDNN":
  # hence the need to call cudaSwitches explicitly
  test "tests_cuda", "cpp"

task test_opencl, "Run all OpenCL backend tests":
  switch("define", "opencl")
  test "tests_opencl"

# task test_deprecated, "Run all tests on deprecated procs":
# test "tests_cpu_deprecated"

147 changes: 147 additions & 0 deletions src/tensor/backend/opencl_backend.nim
@@ -0,0 +1,147 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ../data_structure,
       ./opencl_global_state,
       ./global_config,
       ./metadataArray,
       nimcl, opencl, macros

export nimcl, opencl, opencl_global_state


# Data structures to ease interfacing with OpenCL and kernels

proc toClpointer*[T](p: ptr T|ptr UncheckedArray[T]): PMem {.noSideEffect.}=
  cast[PMem](p)

proc clMalloc*[T](size: Natural): ptr UncheckedArray[T] {.inline.}=
## Internal proc.
## Wrap OpenCL createBuffer
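  # nimcl's buffer() returns an opaque PMem handle to device memory;
  # the cast below only retypes the handle, the pointer is not meant
  # for host-side dereferencing.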
  cast[type result](
    buffer[T](clContext0, size)
  )

proc deallocCl*[T](p: ref[ptr UncheckedArray[T]]) {.noSideEffect.}=
  if not p[].isNil:
    check releaseMemObject p[].toClpointer

# ##############################################################
# # Base ClStorage type

proc newClStorage*[T: SomeReal](length: int): ClStorage[T] =
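  # deallocCl is registered as the ref's finalizer below, so the GC
  # releases the device buffer once the tracking ref is collected.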
  result.Flen = length
  new(result.Fref_tracking, deallocCl)
  result.Fdata = clMalloc[T](result.Flen)
  result.Fref_tracking[] = result.Fdata

# #########################################################
# # Sending tensor layout to OpenCL Kernel

type
  ClLayoutArray = ref[ptr UncheckedArray[cint]]
    ## Reference to an array on the device
    # TODO: finalizer
    # or replace by a distinct type with a destructor

  ClTensorLayout[T: SomeReal] = object
    ## Mimics CudaTensor
    ## Metadata stored on GPU or Accelerators

    rank*: cint # Number of dimensions of the tensor
    shape*: ClLayoutArray
    strides*: ClLayoutArray
    offset*: cint
    data*: ptr T # Data on the OpenCL device
    len*: cint # Number of elements allocated in memory

proc layoutOnDevice*[T:SomeReal](t: ClTensor[T]): ClTensorLayout[T] =
  ## Store a ClTensor's shape, strides, etc. information on the GPU
  #
  # TODO: instead of storing pointers to shape/strides/etc. that are passed to each kernel,
  # pass the layout object directly and access it as layout->shape, layout->rank

  result.rank = t.rank.cint

  result.offset = t.offset.cint
  result.data = t.get_data_ptr
  result.len = t.size.cint

  new result.shape, deallocCl
  new result.strides, deallocCl

  result.shape[] = clMalloc[cint](MAXRANK)
  result.strides[] = clMalloc[cint](MAXRANK)

  var
    tmp_shape: array[MAXRANK, cint] # ClLayoutArray
    tmp_strides: array[MAXRANK, cint] # ClLayoutArray

  for i in 0..<t.rank:
    tmp_shape[i] = t.shape[i].cint
    tmp_strides[i] = t.strides[i].cint


  # TODO: use streams and async
  let size = t.rank * sizeof(cint)
  check enqueueWriteBuffer(
    clQueue0,
    result.shape[].toClpointer,
    CL_false, # Non-blocking copy
    0,
    size,
    addr tmp_shape[0],
    0, nil, nil
  )

  check enqueueWriteBuffer(
    clQueue0,
    result.strides[].toClpointer,
    CL_true, # Blocking copy: we don't want tmp_strides (and tmp_shape) to disappear while the copy is pending
    0,
    size,
    addr tmp_strides[0],
    0, nil, nil
  )


# #########################################################
# # Variadic number of args, to remove after https://github.com/unicredit/nimcl/pull/1

#### Taken from nimcl
template setArg(kernel: PKernel, item: PMem, index: int) =
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(PMem), addr x)

template setArg[A](kernel: PKernel, item: var A, index: int) =
  check setKernelArg(kernel, index.uint32, sizeof(A), addr item)

template setArg[A](kernel: PKernel, item: LocalBuffer[A], index: int) =
  check setKernelArg(kernel, index.uint32, int(item) * sizeof(A), nil)

template setArg(kernel: PKernel, item: SomeInteger, index: int) =
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(type(item)), addr x)
####

macro args*(kernel: PKernel, args: varargs[untyped]): untyped =

  result = newStmtList()

  var i = 0 # manual counter: no `pairs` iterator in a macro for loop
  for arg in items(args):
    let s = quote do:
      `kernel`.setArg(`arg`, `i`)
    result.add(s)
    inc i
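For illustration, a minimal expansion sketch of the args macro above (the kernel and arguments here are hypothetical, not part of this commit):

myKernel.args(rank, a.toClpointer, b.toClpointer)
# ...expands to:
myKernel.setArg(rank, 0)
myKernel.setArg(a.toClpointer, 1)
myKernel.setArg(b.toClpointer, 2)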
30 changes: 30 additions & 0 deletions src/tensor/backend/opencl_global_state.nim
@@ -0,0 +1,30 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcl, opencl

# ###################################################
# Global OpenCL state

{.experimental.}

type clResource = PCommandQueue | PKernel | PProgram | PMem | PContext

proc `=destroy`*(clres: clResource) =
  release clres

# TODO detect and use accelerators (FPGAs) or GPU by default
# And allow switching OpenCL device.
let (clDevice0*, clContext0*, clQueue0*) = singleDeviceDefaults()
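For illustration, a hedged sketch of how these defaults are consumed; the same pattern appears in operators_blas_l1_opencl.nim below (`kernelSrc`, "MyKernel" and `n` are hypothetical):

let program = clContext0.createAndBuild(kernelSrc, clDevice0) # JIT-compile OpenCL C source
let myKernel = program.createKernel("MyKernel")               # look up the kernel entry point
clQueue0.run(myKernel, n)                                     # enqueue over n work-items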

33 changes: 23 additions & 10 deletions src/tensor/data_structure.nim
@@ -18,14 +18,6 @@ import ./backend/metadataArray,
export nimblas.OrderType

type
-  Backend* = enum
-    ## ``Backend`` for tensor computation and memory allocation.
-    ##
-    ##
-    ## Only deprecated procs from v0.1.3 use this for the moment.
-    Cpu,
-    Cuda

  CpuStorage* {.shallow.} [T] = object
    ## Opaque data storage for Tensors
    ## Currently implemented as a seq with reference semantics (shallow copy on assignment).
@@ -55,7 +47,6 @@ type
    offset*: int
    storage*: CpuStorage[T]

-type
  CudaStorage*[T: SomeReal] = object
    ## Opaque seq-like structure for storage on the Cuda backend.
    ##
@@ -82,7 +73,29 @@ type
    offset*: int
    storage*: CudaStorage[T]

-  AnyTensor*[T] = Tensor[T] or CudaTensor[T]
  ClStorage*[T: SomeReal] = object
    ## Opaque seq-like structure for storage on the OpenCL backend.
    Flen*: int
    Fdata*: ptr UncheckedArray[T]
    Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.

  ClTensor*[T: SomeReal] = object
    ## Tensor data structure stored on OpenCL (CPU, GPU, FPGAs or other accelerators)
    ##   - ``shape``: Dimensions of the ClTensor
    ##   - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ##   - ``offset``: Offset to get the first item of the ClTensor. Note: the offset can be negative, in particular for slices.
    ##   - ``storage``: An opaque data storage for the ClTensor
    ##
    ## Warning ⚠:
    ##   Assignment ``var a = b`` does not copy the data. Data modification on one ClTensor will be reflected on the other.
    ##   However modification on metadata (shape, strides or offset) will not affect the other tensor.
    ##   Explicit copies can be made with ``clone``: ``var a = b.clone``
    shape*: MetadataArray
    strides*: MetadataArray
    offset*: int
    storage*: ClStorage[T]

  AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]
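Since the alias now covers all three backends, generics constrained on AnyTensor keep working unchanged. A minimal sketch, with an illustrative proc name (not part of this commit):

proc rankOf[T](t: AnyTensor[T]): int =
  # Tensor, CudaTensor and ClTensor all expose `shape: MetadataArray`
  t.shape.len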

# ###############
# Field accessors
81 changes: 81 additions & 0 deletions src/tensor/init_opencl.nim
@@ -0,0 +1,81 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ../private/sequninit,
       ./private/p_init_opencl,
       ./backend/opencl_backend,
       ./data_structure,
       ./init_cpu

proc opencl*[T:SomeReal](t: Tensor[T]): ClTensor[T] {.noInit.}=
  ## Convert a tensor on Cpu to a tensor on an OpenCL device.

  result = newClTensor[T](t.shape)

  let contig_t = t.asContiguous(rowMajor, force = true)
  let size = csize(result.size * sizeof(T))

  check enqueueWriteBuffer(
    clQueue0,
    result.get_data_ptr.toClpointer,
    CL_true, # Blocking copy, we don't want contig_t to disappear while the copy is pending
    0,
    size,
    contig_t.get_data_ptr.toClpointer,
    0, nil, nil
  )

proc cpu*[T:SomeReal](t: ClTensor[T]): Tensor[T] {.noInit.}=
  ## Convert a tensor on an OpenCL device to a tensor on Cpu.
  # We use a blocking copy in this case to make sure
  # all data is available for future computation

  result.shape = t.shape
  result.strides = t.strides
  result.offset = t.offset
  result.data = newSeqUninit[T](t.storage.Flen) # We copy over all the allocated memory (without a prior asContiguous)

  let size = t.storage.Flen * sizeof(T)

  check enqueueReadBuffer(
    clQueue0,
    t.get_data_ptr.toClpointer,
    CL_true, # Blocking copy, we don't want computation to continue while the copy is still pending
    0,
    size,
    result.get_data_ptr.toClpointer,
    0, nil, nil
  )
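
For illustration, a hedged round-trip sketch using the two converters above (assumes compilation with -d:opencl; `toTensor` is the CPU constructor from init_cpu):

let a = [[1.0'f32, 2.0], [3.0, 4.0]].toTensor # Tensor[float32] on the CPU
let b = a.opencl                              # blocking copy to the OpenCL device
let c = b.cpu                                 # blocking copy back to the CPU
assert c == a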



proc zeros_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor filled with 0 with the same shape as the input
  ## Input:
  ##   - A ClTensor
  ## Result:
  ##   - A zero-ed ClTensor of the same shape

  # TODO use clEnqueueFillBuffer (OpenCL 1.2 only)
  result = zeros[T](t.shape).opencl

proc ones_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor with the same shape as the input, filled with 1
  ## Input:
  ##   - A ClTensor
  ## Result:
  ##   - A one-ed ClTensor of the same shape
  result = ones[T](t.shape).opencl
55 changes: 55 additions & 0 deletions src/tensor/operators_blas_l1_opencl.nim
@@ -0,0 +1,55 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import ./backend/opencl_backend,
       ./private/p_kernels_interface_opencl,
       ./private/p_init_opencl,
       ./private/p_checks,
       ./data_structure

# Kernels are created just-in-time, which incurs some overhead.
# Unfortunately, doing otherwise would require prebuilding binaries for
# each AMD, Nvidia, Intel, Qualcomm, ... OpenCL SDK and driver combination.
# Nvidia automatically caches OpenCL JIT compilations.
# For maximum performance you might need a similar caching scheme on your platform.

template genAdd(T: typedesc, ctype: string): untyped =
  proc `+`*(a, b: ClTensor[T]): ClTensor[T] {.noInit.}=
    ## ClTensor addition

    when compileOption("boundChecks"):
      check_elementwise(a,b)

    result = newClTensor[T](a.shape)

    let
      ocl_addKernel = gen_ocl_apply3("AddKernel", ctype, "+")
      program = clContext0.createAndBuild(ocl_addKernel, clDevice0)
      opencl_add = program.createKernel("AddKernel")

      dst = layoutOnDevice result
      src_a = layoutOnDevice a
      src_b = layoutOnDevice b

    opencl_add.args(dst.rank, dst.len,
      dst.shape[], dst.strides[], dst.offset, dst.data.toClpointer,
      src_a.shape[], src_a.strides[], src_a.offset, src_a.data.toClpointer,
      src_b.shape[], src_b.strides[], src_b.offset, src_b.data.toClpointer
    )

    clQueue0.run(opencl_add, result.size)

genAdd(float32, "float")
genAdd(float64, "double")
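
For illustration, a hedged end-to-end sketch of the generated `+` operator (assumes -d:opencl and the opencl/cpu converters from init_opencl.nim):

let
  u = [1.0'f32, 2, 3].toTensor.opencl
  v = [10.0'f32, 20, 30].toTensor.opencl
  w = u + v  # JIT-compiles AddKernel for float32 ("float") and runs it on the device
echo w.cpu   # copy back and print; values are [11.0, 22.0, 33.0]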
