🔥 OpenCL tensors ⚡ (#184). Closes #69
* OpenCL 🔥 initial commit 🎆

* 🔥 Brace yourself ❄️ OpenCL ❄️ is coming
mratsim committed Jan 14, 2018
1 parent fc4ad52 commit b061aef
Showing 16 changed files with 590 additions and 15 deletions.
6 changes: 5 additions & 1 deletion arraymancer.nimble
@@ -5,7 +5,7 @@ description = "A n-dimensional tensor (ndarray) library"
license = "Apache License 2.0"

### Dependencies
requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4"
requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4", "nimcl >= 0.1.1"

## Install files
srcDir = "src"
@@ -124,6 +124,10 @@ task test_cuda, "Run all tests - Cuda backend with CUBLAS and CuDNN":
  # hence the need to call cudaSwitches explicitly
  test "tests_cuda", "cpp"

task test_opencl, "Run all OpenCL backend tests":
  switch("define", "opencl")
  test "tests_opencl"

# task test_deprecated, "Run all tests on deprecated procs":
# test "tests_cpu_deprecated"

147 changes: 147 additions & 0 deletions src/tensor/backend/opencl_backend.nim
@@ -0,0 +1,147 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ../data_structure,
       ./opencl_global_state,
       ./global_config,
       ./metadataArray,
       nimcl, opencl, macros

export nimcl, opencl, opencl_global_state


# Data structures to ease interfacing with OpenCL and kernels

proc toClpointer*[T](p: ptr T|ptr UncheckedArray[T]): PMem {.noSideEffect.}=
  cast[PMem](p)

proc clMalloc*[T](size: Natural): ptr UncheckedArray[T] {.inline.}=
## Internal proc.
## Wrap OpenCL createBuffer
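  # nimcl's buffer() returns an opaque PMem handle to device memory;
  # the cast below only retypes the handle, the pointer is not meant
  # for host-side dereferencing.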
  cast[type result](
    buffer[T](clContext0, size)
  )

proc deallocCl*[T](p: ref[ptr UncheckedArray[T]]) {.noSideEffect.}=
  if not p[].isNil:
    check releaseMemObject p[].toClpointer

# ##############################################################
# # Base ClStorage type

proc newClStorage*[T: SomeReal](length: int): ClStorage[T] =
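  # deallocCl is registered as the ref's finalizer below, so the GC
  # releases the device buffer once the tracking ref is collected.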
  result.Flen = length
  new(result.Fref_tracking, deallocCl)
  result.Fdata = clMalloc[T](result.Flen)
  result.Fref_tracking[] = result.Fdata

# #########################################################
# # Sending tensor layout to OpenCL Kernel

type
  ClLayoutArray = ref[ptr UncheckedArray[cint]]
    ## Reference to an array on the device
    # TODO: finalizer
    # or replace by a distinct type with a destructor

  ClTensorLayout[T: SomeReal] = object
    ## Mimics CudaTensor
    ## Metadata stored on GPU or Accelerators

    rank*: cint # Number of dimensions of the tensor
    shape*: ClLayoutArray
    strides*: ClLayoutArray
    offset*: cint
    data*: ptr T # Data on the OpenCL device
    len*: cint # Number of elements allocated in memory

proc layoutOnDevice*[T:SomeReal](t: ClTensor[T]): ClTensorLayout[T] =
  ## Store a ClTensor's shape, strides, etc. information on the GPU
  #
  # TODO: instead of storing pointers to shape/strides/etc. that are passed to each kernel,
  # pass the layout object directly and access it as layout->shape, layout->rank

  result.rank = t.rank.cint

  result.offset = t.offset.cint
  result.data = t.get_data_ptr
  result.len = t.size.cint

  new result.shape, deallocCl
  new result.strides, deallocCl

  result.shape[] = clMalloc[cint](MAXRANK)
  result.strides[] = clMalloc[cint](MAXRANK)

  var
    tmp_shape: array[MAXRANK, cint] # ClLayoutArray
    tmp_strides: array[MAXRANK, cint] # ClLayoutArray

  for i in 0..<t.rank:
    tmp_shape[i] = t.shape[i].cint
    tmp_strides[i] = t.strides[i].cint


  # TODO: use streams and async
  let size = t.rank * sizeof(cint)
  check enqueueWriteBuffer(
    clQueue0,
    result.shape[].toClpointer,
    CL_false, # Non-blocking copy
    0,
    size,
    addr tmp_shape[0],
    0, nil, nil
  )

  check enqueueWriteBuffer(
    clQueue0,
    result.strides[].toClpointer,
    CL_true, # Blocking copy: we don't want tmp_strides (and tmp_shape) to disappear while the copy is pending
    0,
    size,
    addr tmp_strides[0],
    0, nil, nil
  )


# #########################################################
# # Variadic number of args, to remove after https://github.com/unicredit/nimcl/pull/1

#### Taken from nimcl
template setArg(kernel: PKernel, item: PMem, index: int) =
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(PMem), addr x)

template setArg[A](kernel: PKernel, item: var A, index: int) =
  check setKernelArg(kernel, index.uint32, sizeof(A), addr item)

template setArg[A](kernel: PKernel, item: LocalBuffer[A], index: int) =
  check setKernelArg(kernel, index.uint32, int(item) * sizeof(A), nil)

template setArg(kernel: PKernel, item: SomeInteger, index: int) =
  var x = item
  check setKernelArg(kernel, index.uint32, sizeof(type(item)), addr x)
####

macro args*(kernel: PKernel, args: varargs[untyped]): untyped =

  result = newStmtList()

  var i = 0 # manual counter: no `pairs` iterator in a macro for loop
  for arg in items(args):
    let s = quote do:
      `kernel`.setArg(`arg`, `i`)
    result.add(s)
    inc i
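For illustration, a minimal expansion sketch of the args macro above (the kernel and arguments here are hypothetical, not part of this commit):

myKernel.args(rank, a.toClpointer, b.toClpointer)
# ...expands to:
myKernel.setArg(rank, 0)
myKernel.setArg(a.toClpointer, 1)
myKernel.setArg(b.toClpointer, 2)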
30 changes: 30 additions & 0 deletions src/tensor/backend/opencl_global_state.nim
@@ -0,0 +1,30 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcl, opencl

# ###################################################
# Global OpenCL state

{.experimental.}

type clResource = PCommandQueue | PKernel | PProgram | PMem | PContext

proc `=destroy`*(clres: clResource) =
  release clres

# TODO detect and use accelerators (FPGAs) or GPU by default
# And allow switching OpenCL device.
let (clDevice0*, clContext0*, clQueue0*) = singleDeviceDefaults()
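For illustration, a hedged sketch of how these defaults are consumed; the same pattern appears in operators_blas_l1_opencl.nim below (`kernelSrc`, "MyKernel" and `n` are hypothetical):

let program = clContext0.createAndBuild(kernelSrc, clDevice0) # JIT-compile OpenCL C source
let myKernel = program.createKernel("MyKernel")               # look up the kernel entry point
clQueue0.run(myKernel, n)                                     # enqueue over n work-items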

33 changes: 23 additions & 10 deletions src/tensor/data_structure.nim
@@ -18,14 +18,6 @@ import ./backend/metadataArray,
export nimblas.OrderType

type
-  Backend* = enum
-    ## ``Backend`` for tensor computation and memory allocation.
-    ##
-    ##
-    ## Only deprecated procs from v0.1.3 use this for the moment.
-    Cpu,
-    Cuda

  CpuStorage* {.shallow.} [T] = object
    ## Opaque data storage for Tensors
    ## Currently implemented as a seq with reference semantics (shallow copy on assignment).
@@ -55,7 +47,6 @@ type
    offset*: int
    storage*: CpuStorage[T]

-type
  CudaStorage*[T: SomeReal] = object
    ## Opaque seq-like structure for storage on the Cuda backend.
    ##
@@ -82,7 +73,29 @@ type
    offset*: int
    storage*: CudaStorage[T]

-  AnyTensor*[T] = Tensor[T] or CudaTensor[T]
  ClStorage*[T: SomeReal] = object
    ## Opaque seq-like structure for storage on the OpenCL backend.
    Flen*: int
    Fdata*: ptr UncheckedArray[T]
    Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.

  ClTensor*[T: SomeReal] = object
    ## Tensor data structure stored on OpenCL (CPU, GPU, FPGAs or other accelerators)
    ##   - ``shape``: Dimensions of the ClTensor
    ##   - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ##   - ``offset``: Offset to get the first item of the ClTensor. Note: the offset can be negative, in particular for slices.
    ##   - ``storage``: An opaque data storage for the ClTensor
    ##
    ## Warning ⚠:
    ##   Assignment ``var a = b`` does not copy the data. Data modification on one ClTensor will be reflected on the other.
    ##   However modification on metadata (shape, strides or offset) will not affect the other tensor.
    ##   Explicit copies can be made with ``clone``: ``var a = b.clone``
    shape*: MetadataArray
    strides*: MetadataArray
    offset*: int
    storage*: ClStorage[T]

  AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]
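Since the alias now covers all three backends, generics constrained on AnyTensor keep working unchanged. A minimal sketch, with an illustrative proc name (not part of this commit):

proc rankOf[T](t: AnyTensor[T]): int =
  # Tensor, CudaTensor and ClTensor all expose `shape: MetadataArray`
  t.shape.len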

# ###############
# Field accessors
81 changes: 81 additions & 0 deletions src/tensor/init_opencl.nim
@@ -0,0 +1,81 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ../private/sequninit,
       ./private/p_init_opencl,
       ./backend/opencl_backend,
       ./data_structure,
       ./init_cpu

proc opencl*[T:SomeReal](t: Tensor[T]): ClTensor[T] {.noInit.}=
  ## Convert a tensor on Cpu to a tensor on an OpenCL device.

  result = newClTensor[T](t.shape)

  let contig_t = t.asContiguous(rowMajor, force = true)
  let size = csize(result.size * sizeof(T))

  check enqueueWriteBuffer(
    clQueue0,
    result.get_data_ptr.toClpointer,
    CL_true, # Blocking copy, we don't want contig_t to disappear while the copy is pending
    0,
    size,
    contig_t.get_data_ptr.toClpointer,
    0, nil, nil
  )

proc cpu*[T:SomeReal](t: ClTensor[T]): Tensor[T] {.noInit.}=
  ## Convert a tensor on an OpenCL device to a tensor on Cpu.
  # We use a blocking copy in this case to make sure
  # all data is available for future computation

  result.shape = t.shape
  result.strides = t.strides
  result.offset = t.offset
  result.data = newSeqUninit[T](t.storage.Flen) # We copy over all the allocated memory (without a prior asContiguous)

  let size = t.storage.Flen * sizeof(T)

  check enqueueReadBuffer(
    clQueue0,
    t.get_data_ptr.toClpointer,
    CL_true, # Blocking copy, we don't want computation to continue while the copy is still pending
    0,
    size,
    result.get_data_ptr.toClpointer,
    0, nil, nil
  )
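
For illustration, a hedged round-trip sketch using the two converters above (assumes compilation with -d:opencl; `toTensor` is the CPU constructor from init_cpu):

let a = [[1.0'f32, 2.0], [3.0, 4.0]].toTensor # Tensor[float32] on the CPU
let b = a.opencl                              # blocking copy to the OpenCL device
let c = b.cpu                                 # blocking copy back to the CPU
assert c == a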



proc zeros_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor filled with 0 with the same shape as the input
  ## Input:
  ##   - A ClTensor
  ## Result:
  ##   - A zero-ed ClTensor of the same shape

  # TODO use clEnqueueFillBuffer (OpenCL 1.2 only)
  result = zeros[T](t.shape).opencl

proc ones_like*[T: SomeReal](t: ClTensor[T]): ClTensor[T] {.noInit, inline.} =
  ## Creates a new ClTensor with the same shape as the input, filled with 1
  ## Input:
  ##   - A ClTensor
  ## Result:
  ##   - A one-ed ClTensor of the same shape
  result = ones[T](t.shape).opencl
55 changes: 55 additions & 0 deletions src/tensor/operators_blas_l1_opencl.nim
@@ -0,0 +1,55 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import ./backend/opencl_backend,
       ./private/p_kernels_interface_opencl,
       ./private/p_init_opencl,
       ./private/p_checks,
       ./data_structure

# Kernels are created just-in-time, which incurs some overhead.
# Unfortunately, doing otherwise would require prebuilding binaries for
# each AMD, Nvidia, Intel, Qualcomm, ... OpenCL SDK and driver combination.
# Nvidia automatically caches OpenCL JIT compilations.
# For maximum performance you might need a similar caching scheme on your platform.

template genAdd(T: typedesc, ctype: string): untyped =
  proc `+`*(a, b: ClTensor[T]): ClTensor[T] {.noInit.}=
    ## ClTensor addition

    when compileOption("boundChecks"):
      check_elementwise(a,b)

    result = newClTensor[T](a.shape)

    let
      ocl_addKernel = gen_ocl_apply3("AddKernel", ctype, "+")
      program = clContext0.createAndBuild(ocl_addKernel, clDevice0)
      opencl_add = program.createKernel("AddKernel")

      dst = layoutOnDevice result
      src_a = layoutOnDevice a
      src_b = layoutOnDevice b

    opencl_add.args(dst.rank, dst.len,
      dst.shape[], dst.strides[], dst.offset, dst.data.toClpointer,
      src_a.shape[], src_a.strides[], src_a.offset, src_a.data.toClpointer,
      src_b.shape[], src_b.strides[], src_b.offset, src_b.data.toClpointer
    )

    clQueue0.run(opencl_add, result.size)

genAdd(float32, "float")
genAdd(float64, "double")
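
For illustration, a hedged end-to-end sketch of the generated `+` operator (assumes -d:opencl and the opencl/cpu converters from init_opencl.nim):

let
  u = [1.0'f32, 2, 3].toTensor.opencl
  v = [10.0'f32, 20, 30].toTensor.opencl
  w = u + v  # JIT-compiles AddKernel for float32 ("float") and runs it on the device
echo w.cpu   # copy back and print; values are [11.0, 22.0, 33.0]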
