Change to reference semantics (assignment/slicing shares data instead of copy) #161

Merged on Nov 27, 2017 (22 commits).

Commits
17035e5
initial change to the data structure: compiles but fails tests
mratsim Nov 25, 2017
3e7a7b0
Add docs + noSideEffect
mratsim Nov 25, 2017
ce0cf6b
Remove and deprecate unsafeView, Fix unsafeView making `$` access nil …
mratsim Nov 25, 2017
e160b62
Tests green: fix tests to use `clone`. Change unsafeReshape.
mratsim Nov 25, 2017
ab57aef
Align memory access for strided iterators
mratsim Nov 26, 2017
d04cfe8
Fix old deprecated procs and tests, update changelog
mratsim Nov 26, 2017
28bce4c
Fix deprecated naming convention
mratsim Nov 26, 2017
5ba4918
Move exported deprecated proc to deprecated folder + Fix atAxisIndex
mratsim Nov 26, 2017
3c93ab5
openmp optim
mratsim Nov 26, 2017
cab08b0
Rename inner_typed_dispatch macro to slice_typed_dispatch
mratsim Nov 26, 2017
c7644b0
WIP unsafeSlicer to change --> causes circular macro/template call
mratsim Nov 26, 2017
be09560
Completed: remove unsafeSlicer
mratsim Nov 26, 2017
6dc818c
Remove an unneeded clone in higher order
mratsim Nov 26, 2017
61ec0e7
step by step trying not to break the steps while removing unsafe
mratsim Nov 26, 2017
a570a87
change broadcast
mratsim Nov 26, 2017
8bc5ee3
Change unsafeContiguous to asContiguous
mratsim Nov 26, 2017
48bc9a9
Remove unsafe from autograd and nn and nn_primitives
mratsim Nov 26, 2017
34a397a
WIP: convolution - contiguous
mratsim Nov 26, 2017
7c05cfa
WIP convolution - squeeze, unsqueeze, transpose
mratsim Nov 26, 2017
a353bbb
Removing unsafeReshape from conv. There is breakage with atAxisIndex …
mratsim Nov 27, 2017
fccea72
Last unsafeReshape updates + remove converter from ArrayOfSlices
mratsim Nov 27, 2017
ef4e531
Breaking master (but not v0.2.0): remove unsafe from Cuda procs name …
mratsim Nov 27, 2017
27 changes: 27 additions & 0 deletions changelog.md
@@ -1,3 +1,30 @@
Arraymancer v0.3.0
==========================

I am very excited to announce the second release of Arraymancer, which includes numerous improvements and breaking changes.
WARNING: Deprecated procs will be removed in a follow-up release in a week, due to deprecation warning spam.

Note:
- `zeros`, `ones` and `newTensor` currently emit spurious deprecation warnings; see the Deprecated section below.

- **Very** Breaking
- Tensors now use reference semantics: `let a = b` shares data by default, and copies must be made explicitly (see the sketch after this list).
- There is no need to use `unsafe` procs to avoid copies, especially for slices.
- `unsafe` procs are deprecated and will be removed, leading to a smaller and simpler codebase, API and documentation.
- Tensors and CudaTensors now work the same way.
- Use `clone` to make explicit copies.
- Arraymancer now works like Numpy and Julia, making it easier to port code.
- Unfortunately it makes it harder to debug unexpected data sharing.
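A minimal sketch of the new semantics (assuming the v0.3 API described above, with the new `zeros[int]([...])` signature and `clone`):

```nim
import arraymancer

var a = zeros[int]([2, 2])
var b = a              # reference semantics: b shares a's buffer, no copy
b[0, 0] = 1
echo a[0, 0]           # 1: the write through b is visible in a

let c = a.clone()      # explicit deep copy
a[1, 1] = 5
echo c[1, 1]           # 0: c owns its own buffer
```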

- Deprecated
- Version 0.3.1, with ALL deprecated procs removed, will be released in a week. Due to issue https://github.com/nim-lang/Nim/issues/6436,
  you will get a deprecation warning even when using non-deprecated procs like `zeros`, `ones` and `newTensor`.
- The arguments of `newTensor`, `zeros` and `ones` have changed from `zeros([5, 5], int)` to `zeros[int]([5, 5])`.
- All `unsafe` procs are now the default behaviour and are deprecated.


- Cuda:
- Support for convolution forward and backward


Arraymancer v0.2.0 Sept. 24, 2017 "The Color of Magic"
4 changes: 2 additions & 2 deletions src/autograd/ag_accessors.nim
@@ -21,8 +21,8 @@ template `[]`*[TT](v: Variable[TT], args: varargs[untyped]): Variable[TT] =

result.tape = v.tape
result.ancestor = v.ancestor
- result.value = v.value.unsafeSlice(args)
- result.grad = v.grad.unsafeSlice(args)
+ result.value = v.value[args]
+ result.grad = v.grad[args]

result

4 changes: 2 additions & 2 deletions src/autograd/gates_blas.nim
@@ -29,8 +29,8 @@ method forward*[TT](self: MatMulGate[TT], a, b: Variable[TT]): Variable[TT] {.in
result.grad = zeros[getSubType(TT)](result.value.shape)

method backward*[TT](self: MatMulGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
- result[0] = gradient * self.b.value.unsafeTranspose
- result[1] = self.a.value.unsafeTranspose * gradient
+ result[0] = gradient * self.b.value.transpose
+ result[1] = self.a.value.transpose * gradient

proc `*`*[TT](a, b: Variable[TT]): Variable[TT] =
when compileOption("boundChecks"):
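For context, the `transpose` calls implement the standard matrix-multiplication gradient (a textbook identity, restated here rather than taken from the diff): for $C = AB$ with upstream gradient $G = \partial L / \partial C$,

$$\frac{\partial L}{\partial A} = G\,B^{\top}, \qquad \frac{\partial L}{\partial B} = A^{\top} G.$$

After this change `transpose` returns a view rather than a copy, so the backward pass allocates nothing for the transposes.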
2 changes: 1 addition & 1 deletion src/autograd/gates_reduce.nim
@@ -34,7 +34,7 @@ method backward*[TT](self: MeanGate[TT], gradient: TT): SmallDiffs[TT] {.noInit,
result[0] = gradient / getSubType(TT)(self.a_shape.product) # Conversion to subtype T, oh Higher kinded-types ...

let z_shape = newSeqWith(self.a_shape.len, 1) # We create a shape of 1 dimension that we will expand with broadcast
- result[0] = result[0].unsafeReshape(z_shape).unsafeBroadcast(self.a_shape)
+ result[0] = result[0].reshape(z_shape).broadcast(self.a_shape)

proc mean*[TT](a: Variable[TT]): Variable[TT] =
when compileOption("boundChecks"):
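The reshape-then-broadcast idiom above expands the averaged gradient back to the input shape without copying. A small standalone sketch of the same computation, assuming the v0.3 API used in the diff (`reshape`/`broadcast` accepting shape seqs):

```nim
import arraymancer, sequtils

let a_shape = @[2, 3]                     # shape of the forward input
let n = a_shape.foldl(a * b)              # number of elements N = 6
var grad = [6.0].toTensor / n.float       # d(mean)/dx_i = 1/N for every i
let z_shape = newSeqWith(a_shape.len, 1)  # @[1, 1]: one dim per input axis
let g = grad.reshape(z_shape).broadcast(a_shape)  # broadcast view, no copy
echo g                                    # 2x3 tensor, every element 1.0
```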
2 changes: 1 addition & 1 deletion src/nn/activation/relu.nim
@@ -54,4 +54,4 @@ proc relu*[TT](a: Variable[TT]): Variable[TT] =
node.child = result

# Caching for backprop
- gate.cache = result.value.unsafeView
+ gate.cache = result.value
4 changes: 2 additions & 2 deletions src/nn/layers/linear.nim
@@ -33,8 +33,8 @@ method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inlin
result.grad = zeros_like(result.value)

method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
- result[0] = self.W.value.unsafeTranspose * gradient # grad w.r.t. x
- result[1] = gradient * self.x.value.unsafeTranspose # grad w.r.t. weight
+ result[0] = self.W.value.transpose * gradient # grad w.r.t. x
+ result[1] = gradient * self.x.value.transpose # grad w.r.t. weight

if not self.b.isNil:
result[2] = sum(gradient, axis=0) # grad w.r.t. bias
4 changes: 2 additions & 2 deletions src/nn/loss/sigmoid_cross_entropy.nim
@@ -30,7 +30,7 @@ method forward*[TT](self: SigmoidCrossEntropyLoss[TT], a: Variable[TT], target:
result.tape = a.tape

# TODO: implement a Scalar[T] concept instead of rewrapping the result into a Tensor
- result.value = [sigmoid_cross_entropy(a.value, target)].toTensor.unsafeView
+ result.value = [sigmoid_cross_entropy(a.value, target)].toTensor

result.grad = zeros[getSubType(TT)](1)

@@ -44,7 +44,7 @@ proc sigmoid_cross_entropy*[TT](a: Variable[TT], target: TT): Variable[TT] =
new gate
gate.arity = 1
gate.cache = a
- gate.target = target.unsafeView
+ gate.target = target

# Node
var node: Node[TT]
6 changes: 3 additions & 3 deletions src/nn_primitives/backend/nnpack_interface.nim
@@ -26,8 +26,8 @@ proc nnpack_conv2d*(input, weight, bias: Tensor[float32], padding, stride: Size2
output_width = (2*padding.width + input.nchw_width) - (weight.nchw_width - 1)

# Make sure the data is contiguous before passing to nnpack
- let input = input.unsafeContiguous()
- let weight = weight.unsafeContiguous()
+ let input = input.asContiguous()
+ let weight = weight.asContiguous()
var bias_nonnil: Tensor[float32] # TODO make bias truly optional and not just a tensor of rank 0


@@ -36,7 +36,7 @@ proc nnpack_conv2d(input, weight, bias: Tensor[float32], padding, stride: Size2
# Temporary bias filled with zeros just to pass to nnpack
bias_nonnil = zeros[float32](output_channels)
else:
- bias_nonnil = bias.unsafeContiguous()
+ bias_nonnil = bias.asContiguous()

# Prepare tensor that the result will be stored on
result = newTensorUninit[float32](input.shape[0], output_channels, output_height, output_width)
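`asContiguous` is the new name for `unsafeContiguous`: it hands back the tensor itself when the data is already contiguous in the requested layout, and materialises a compact copy only when necessary (or when forced, as the CuDNN code further down does with `force = true`). A sketch under those assumptions:

```nim
import arraymancer, sequtils

let t = toSeq(1..6).toTensor.reshape(2, 3)
let col = t[_, 1]             # strided column view: not row-major contiguous
let c = col.asContiguous()    # copies, because it has to
let s = t.asContiguous()      # t is already contiguous: no copy, data shared
```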
23 changes: 12 additions & 11 deletions src/nn_primitives/fallback/conv.nim
@@ -91,19 +91,20 @@ proc im2colgemm_conv2d*[T](input, kernel, bias: Tensor[T],
output_height = (input.nchw_height + (2*padding.height) - kernel.nchw_height) div stride.height + 1
output_width = (input.nchw_width + (2*padding.width) - kernel.nchw_width) div stride.width + 1
channels_col = input.nchw_channels * kernel.nchw_height * kernel.nchw_width
- kernel_col = kernel.unsafeReshape(output_channels, channels_col)
+ kernel_col = kernel.reshape(output_channels, channels_col)

result = newTensorUninit[T](batch_size, output_channels, output_height, output_width)
var input_col = newTensorUninit[T](channels_col, output_height * output_width)
var output: Tensor[T]

- for i in 0..<batch_size:
-   im2col(input.unsafeAtAxisIndex(0, i).unsafeSqueeze(0), kernel_size, padding, stride, input_col)
-   output = result.unsafeAtAxisIndex(0, i).unsafeReshape(kernel_col.shape[0], input_col.shape[1])
+ for i in 0..<batch_size: #TODO: batch matmul
+   im2col(input.atAxisIndex(0, i).squeeze(0), kernel_size, padding, stride, input_col)
+   # The following must be done without copy: GEMM will directly write in the result tensor
+   output = result.atAxisIndex(0, i).reshape(kernel_col.shape[0], input_col.shape[1])
gemm(kernel_col, input_col, output)

if bias.rank > 0:
- result .+= bias.unsafeUnsqueeze(0)
+ result .+= bias.unsqueeze(0)

proc im2colgemm_conv2d_gradient*[T](input, kernel: Tensor[T],
padding: Size2D = (0,0),
@@ -119,7 +120,7 @@ proc im2colgemm_conv2d_gradient*[T](input, kernel: Tensor[T],
output_width = (input.nchw_width + (2*padding.width) - kernel.nchw_width) div stride.width + 1
output_flatten_size = output_height*output_width
channels_col = input.nchw_channels * kernel_size.height * kernel_size.width
- kernel_col = kernel.unsafeReshape(output_channels, input.nchw_channels*kernel.nchw_height*kernel.nchw_width)
+ kernel_col = kernel.reshape(output_channels, input.nchw_channels*kernel.nchw_height*kernel.nchw_width)

# Check if grad output shape looks correct
assert grad_output.nchw_width == output_width and grad_output.nchw_height == output_height
@@ -132,9 +133,9 @@ proc im2colgemm_conv2d_gradient*[T](input, kernel: Tensor[T],

for i in 0..<batch_size:
let
- grad_output_col = grad_output.unsafeAtAxisIndex(0, i).unsafeReshape(output_channels, output_flatten_size)
- grad_input_col = kernel_col.unsafeTranspose() * grad_output_col
+ grad_output_col = grad_output.atAxisIndex(0, i).reshape(output_channels, output_flatten_size)
+ grad_input_col = kernel_col.transpose() * grad_output_col

- im2col(input.unsafeAtAxisIndex(0, i).unsafeSqueeze(0), kernel_size, padding, stride, input_col)
- grad_input[i, _, _, _] = col2im(grad_input_col, input.nchw_channels, input.nchw_height, input.nchw_width, kernel_size, padding, stride).unsafeUnsqueeze(0)
- grad_weight += (grad_output_col * input_col.unsafeTranspose()).unsafeReshape(grad_weight.shape)
+ im2col(input.atAxisIndex(0, i).squeeze(0), kernel_size, padding, stride, input_col)
+ grad_input[i, _, _, _] = col2im(grad_input_col, input.nchw_channels, input.nchw_height, input.nchw_width, kernel_size, padding, stride).unsqueeze(0)
+ grad_weight += (grad_output_col * input_col.transpose()).reshape(grad_weight.shape)
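The "must be done without copy" comment above is the crux of this change: after this PR, `reshape` on a contiguous tensor returns a view over the same buffer, so `gemm` writing into `output` writes straight into `result`. A minimal sketch of that property (assuming v0.3 reference semantics):

```nim
import arraymancer

var res = zeros[float32](2, 4)
var flat = res.reshape(4, 2)   # res is contiguous, so this is a view
flat[0, 0] = 1.0'f32
echo res[0, 0]                 # 1.0: the write through the view landed in res
```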
28 changes: 14 additions & 14 deletions src/nn_primitives/nnp_conv2d_cudnn.nim
@@ -52,19 +52,19 @@ proc conv2d*[T: SomeReal](input, kernel, bias: CudaTensor[T],
defaultHandle_cudnn,
addr alpha,
srcTensorDesc,
- input.data.data[],
+ input.get_offset_ptr,
kernelDesc,
- kernel.data.data[],
+ kernel.get_offset_ptr,
convDesc,
algo_workspace.algo,
algo_workspace.workspace[],
algo_workspace.sizeInBytes,
addr beta,
dstTensorDesc,
- result.data.data[]
+ result.get_offset_ptr
)

- result .+= bias.unsafeUnsqueeze(0)
+ result .+= bias.unsqueeze(0)

proc conv2d_backward*[T: float32](input, kernel, bias: CudaTensor[T],
padding: SizeHW = [0,0],
@@ -97,7 +97,7 @@ proc conv2d_backward*[T: float32](input, kernel, bias: CudaTensor[T],

# CuDNN requires grad_output to be C contiguous. (It is undocumented as of CuDNN v7)
# If grad_output is F contiguous it throws CUDNN_STATUS_NOT_SUPPORTED in the algo procs.
- let gOutput = grad_output.unsafeContiguous(rowMajor, force = true)
+ let gOutput = grad_output.asContiguous(rowMajor, force = true)

let # TODO: Automatic destructor
srcTensorDesc = newCudnn4DTensorDesc input
@@ -113,15 +113,15 @@ proc conv2d_backward*[T: float32](input, kernel, bias: CudaTensor[T],

# Bias gradient
if bias.rank > 0:
- let gradBiasTensorDesc = newCudnn4DTensorDesc grad_bias.unsafeUnsqueeze(0)
+ let gradBiasTensorDesc = newCudnn4DTensorDesc grad_bias.unsqueeze(0)
check cudnnConvolutionBackwardBias(
defaultHandle_cudnn,
addr alpha,
gradOutputTensorDesc,
- gOutput.data.data[],
+ gOutput.get_offset_ptr,
addr beta,
gradBiasTensorDesc,
- grad_bias.data.data[]
+ grad_bias.get_offset_ptr
)

# TODO squeeze and divide by batch size?
@@ -143,16 +143,16 @@ proc conv2d_backward*[T: float32](input, kernel, bias: CudaTensor[T],
defaultHandle_cudnn,
addr alpha,
srcTensorDesc,
- input.data.data[],
+ input.get_offset_ptr,
gradOutputTensorDesc,
- gOutput.data.data[],
+ gOutput.get_offset_ptr,
convDesc,
kernel_algo_workspace.algo,
kernel_algo_workspace.workspace[],
kernel_algo_workspace.sizeInBytes,
addr beta,
gradKernelDesc,
- grad_kernel.data.data[]
+ grad_kernel.get_offset_ptr
)

when defined(debug):
@@ -176,14 +176,14 @@ proc conv2d_backward*[T: float32](input, kernel, bias: CudaTensor[T],
defaultHandle_cudnn,
addr alpha,
kernelDesc,
- kernel.data.data[],
+ kernel.get_offset_ptr,
gradOutputTensorDesc,
- gOutput.data.data[],
+ gOutput.get_offset_ptr,
convDesc,
gradInput_algo_workspace.algo,
gradInput_algo_workspace.workspace[],
gradInput_algo_workspace.sizeInBytes,
addr beta,
gradInputTensorDesc,
- grad_input.data.data[]
+ grad_input.get_offset_ptr
)
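Why `get_offset_ptr` replaces `data.data[]` throughout this file: with reference semantics a `CudaTensor` can be a view whose first element sits at a non-zero offset into a shared buffer, so CuDNN must be given the address of that first element rather than the buffer start. A generic illustration of the distinction (hypothetical `Buf` type, not Arraymancer's actual internals):

```nim
type Buf = object
  data: seq[float32]   # storage that may be shared between several views
  offset: int          # index of this view's first element

proc basePtr(b: var Buf): ptr float32 =
  b.data[0].addr         # buffer start: wrong for an offset view

proc offsetPtr(b: var Buf): ptr float32 =
  b.data[b.offset].addr  # first element of the view: what a kernel needs
```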
2 changes: 1 addition & 1 deletion src/nn_primitives/nnp_convolution.nim
@@ -91,7 +91,7 @@ proc conv2d_backward*[T](input, weight, bias: Tensor[T],
# Bias gradient
if bias.rank > 0: # TODO make bias truly optional and not just a tensor of rank 0
# TODO: sum over many axes
- grad_bias = grad_output.sum(3).sum(2).sum(0).unsafeReshape(bias.shape)
+ grad_bias = grad_output.sum(3).sum(2).sum(0).reshape(bias.shape)

case algorithm:
of NNPackAuto:
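The `sum(3).sum(2).sum(0)` chain is the bias gradient: the bias is broadcast over batch, height and width in the forward pass, so its gradient accumulates the output gradient over exactly those axes (a standard result, restated here for clarity):

$$\frac{\partial L}{\partial b_c} = \sum_{n}\sum_{h}\sum_{w} G_{n,c,h,w}, \qquad G = \frac{\partial L}{\partial \mathrm{output}}.$$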
8 changes: 4 additions & 4 deletions src/nn_primitives/nnp_linear.nim
@@ -34,8 +34,8 @@ proc linear_backward*[T](
cached_tensor,
weight, bias: Tensor[T],
dW, db: var Tensor[T]): Tensor[T] {.inline.} =
- result = weight.unsafeTranspose * gradient
- gemm(gradient, cached_tensor.unsafeTranspose, dW)
+ result = weight.transpose * gradient
+ gemm(gradient, cached_tensor.transpose, dW)

db = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html

@@ -44,6 +44,6 @@ proc linear_backward*[T](
cached_tensor,
weight: Tensor[T],
dW: var Tensor[T]): Tensor[T] {.inline.} =
- result = weight.unsafeTranspose * gradient
- gemm(gradient, cached_tensor.unsafeTranspose, dW)
+ result = weight.transpose * gradient
+ gemm(gradient, cached_tensor.transpose, dW)

12 changes: 6 additions & 6 deletions src/nn_primitives/nnp_softmax_cross_entropy.nim
@@ -97,12 +97,12 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T
# ∑i(- ti * yi) is either -yi or 0 in the sparse case.
# Since target holds coordinates: ∑i(- ti * yi) = - yi[ti]
for i in 0||(input.shape[1]-1):
- let lse = input.unsafeSlice(_,i).logsumexp
+ let lse = input[_,i].logsumexp

when not declared(openmp):
- result += lse - input.unsafeSlice(target.unsafeSlice(i), i)
+ result += lse - input[target[i], i]
else:
- let tmp = lse - input.unsafeSlice(target.unsafeSlice(i), i)
+ let tmp = lse - input[target[i], i]
{.emit:"#pragma omp atomic".}
{.emit:"`result` += `tmp`;".}
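For reference, the identity the loop relies on (standard cross-entropy algebra combined with the log-sum-exp trick): with $y = \operatorname{softmax}(x)$ and a one-hot target $t$ selecting class $\tau$,

$$-\sum_i t_i \log y_i = -\log y_\tau = \log\sum_j e^{x_j} - x_\tau,$$

which is exactly the `lse - input[target[i], i]` computed per sample above.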

@@ -140,7 +140,7 @@ proc softmax_cross_entropy_backward*[T](
elif gradient is Tensor:
let grad = gradient.data[gradient.offset]

- let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).unsafeBroadcast(cached_tensor.shape)
+ let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)

result = map3_inline(cached_tensor, target, axis_max_sumexp):
grad * (stable_softmax(x, z.max, z.sumexp) - y) / T(batch_size)
@@ -176,8 +176,8 @@ proc sparse_softmax_cross_entropy_backward*[T](
for i, truth_idx in enumerate(target):
result[truth_idx, i] = -1

- let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).unsafeBroadcast(cached_tensor.shape)
- # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).unsafeBroadcast(cached_tensor.shape)
+ let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
+ # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).broadcast(cached_tensor.shape)


apply3_inline(result, cached_tensor, axis_max_sumexp):
6 changes: 3 additions & 3 deletions src/nn_primitives/private/p_logsumexp.nim
@@ -45,13 +45,13 @@ proc streaming_max_sumexp*[T](t: Tensor[T], axis: int): Tensor[tuple[max:T, sume
result = newTensorUninit[tuple[max:T, sumexp: T]](t.shape[axis])

for i in `||`(0, t.shape[axis]-1, "simd"):
- result.data[i] = t.unsafeAtAxisIndex(axis, i).streaming_max_sumexp
+ result.data[i] = t.atAxisIndex(axis, i).streaming_max_sumexp

# Reexpand the tensor to be consistent with fold_axis/reduce_axis
if axis == 0:
- result = result.unsafeUnsqueeze(1)
+ result = result.unsqueeze(1)
else:
- result = result.unsafeUnsqueeze(0)
+ result = result.unsqueeze(0)


