mratsim · mratsim · Dec 12, 2020 · Dec 12, 2020 · Dec 12, 2020 · Dec 12, 2020
diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim
@@ -1,50 +1,52 @@
 import ../src/arraymancer
 
 # Learning XOR function with a neural network.
-
-# Autograd context / neuralnet graph
-let ctx = newContext Tensor[float32]
-let bsz = 32 # batch size
-
-let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
-let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
-let x_train = ctx.variable(x_train_bool.astype(float32))
-let y = y_bool.astype(float32)
-
-# We will build the following network:
-# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
-
-let layer_3neurons = ctx.variable(
-                       randomTensor(3, 2, 2.0f) -. 1.0f,
-                       requires_grad = true
-                     )
-
-let classifier_layer = ctx.variable(
-                         randomTensor(1, 3, 2.0f) -. 1.0f,
-                         requires_grad = true
-                       )
-
-# Stochastic Gradient Descent
-let optim = newSGD[float32](
-    layer_3neurons, classifier_layer, 0.01f
-  )
-
-# Learning loop
-for epoch in 0..10000:
-  for batch_id in 0..<100:
-
-    # minibatch offset in the Tensor
-    let offset = batch_id * 32
-    let x = x_train[offset ..< offset + 32, _]
-    let target = y[offset ..< offset + 32, _]
-
-    # Building the network
-    let n1 = relu linear(x, layer_3neurons)
-    let n2 = linear(n1, classifier_layer)
-    let loss = n2.sigmoid_cross_entropy(target)
-
-    # Compute the gradient (i.e. contribution of each parameter to the loss)
-    loss.backprop()
-
-    # Correct the weights now that we have the gradient information
-    optim.update()
+proc main() =
+  # Autograd context / neuralnet graph
+  let ctx = newContext Tensor[float32]
+  let bsz = 32 # batch size
+
+  let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
+  let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
+  let x_train = ctx.variable(x_train_bool.astype(float32))
+  let y = y_bool.astype(float32)
+
+  # We will build the following network:
+  # Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
+
+  let layer_3neurons = ctx.variable(
+                        randomTensor(3, 2, 2.0f) -. 1.0f,
+                        requires_grad = true
+                      )
+
+  let classifier_layer = ctx.variable(
+                          randomTensor(1, 3, 2.0f) -. 1.0f,
+                          requires_grad = true
+                        )
+
+  # Stochastic Gradient Descent
+  let optim = newSGD[float32](
+      layer_3neurons, classifier_layer, 0.01f
+    )
+
+  # Learning loop
+  for epoch in 0..10000:
+    for batch_id in 0..<100:
+
+      # minibatch offset in the Tensor
+      let offset = batch_id * 32
+      let x = x_train[offset ..< offset + 32, _]
+      let target = y[offset ..< offset + 32, _]
+
+      # Building the network
+      let n1 = relu linear(x, layer_3neurons)
+      let n2 = linear(n1, classifier_layer)
+      let loss = n2.sigmoid_cross_entropy(target)
+
+      # Compute the gradient (i.e. contribution of each parameter to the loss)
+      loss.backprop()
+
+      # Correct the weights now that we have the gradient information
+      optim.update()
+
+main()
diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim
@@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](
 
   result = zeros_like(cached_tensor)
 
+  # TODO: nested parallelism has suspect performance
   for i in 0||(batch_size-1):
     let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp
 

diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim
@@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](
 
   result = zeros_like(cached_tensor)
 
+  # TODO: nested parallelism has suspect performance
   for i in 0||(batch_size-1): # Can't use OpenMP - SIGSEGV Illegal Address
     let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
 

diff --git a/examples/ex06_shakespeare_generator.nim b/examples/ex06_shakespeare_generator.nim
@@ -56,7 +56,7 @@ const
 #
 # ################################################################
 
-func strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
+proc strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
   result = newTensor[PrintableIdx](str.len)
 
   # For each x in result, map the corresponding char index

diff --git a/src/arraymancer/io/io_image.nim b/src/arraymancer/io/io_image.nim
@@ -27,7 +27,7 @@ proc read_image*(filepath: string): Tensor[uint8] =
   ## Usage example with conversion to [0..1] float:
   ## .. code:: nim
   ##   let raw_img = read_image('path/to/image.png')
-  ##   let img = raw_img.map_inline:
+  ##   let img = forEach x in raw_img:
   ##     x.float32 / 255.0
 
   var width, height, channels: int

diff --git a/src/arraymancer/laser/strided_iteration/foreach_common.nim b/src/arraymancer/laser/strided_iteration/foreach_common.nim
@@ -4,14 +4,73 @@
 # This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  macros,
+  std/[macros, strutils],
   ../compiler_optim_hints
 
-template isVar[T: object](x: T): bool =
+template isVar[T](x: T): bool =
   ## Workaround due to `is` operator not working for `var`
   ## https://github.com/nim-lang/Nim/issues/9443
   compiles(addr(x))
 
+proc aliasTensor(id: int, tensor: NimNode): tuple[alias: NimNode, isVar: NimNode] =
+  ## Produce an alias variable for a tensor
+  ## Supports:
+  ## - identifiers
+  ## - dot call for field access
+  ## - foo[1] for array access
+  if tensor.kind notin {nnkIdent, nnkSym, nnkDotExpr, nnkBracketExpr, nnkCall}:
+    error "Expected a variable identifier or field access or array indexing but found \"" & tensor.repr() &
+      "\". Please assign to a variable first."
+
+  var t = tensor
+
+  # nnkCall handles the annoying typed AST we get in generics
+  # Call
+  #   OpenSymChoice
+  #     Sym "[]"
+  #     Sym "[]"
+  #     ...
+  #   DotExpr
+  #     Ident "self"
+  #     Ident "first_moments"
+  #   Ident "i"
+  if tensor.kind == nnkCall:
+    tensor[0].expectKind nnkOpenSymChoice
+    tensor[0][0].expectKind nnkSym
+    doAssert tensor[0][0].eqIdent("[]")
+
+    # Rewrite the AST to untyped
+    t = nnkBracketExpr.newTree(
+      tensor[1]
+    )
+    for i in 2 ..< tensor.len:
+      t.add tensor[i]
+
+  let isVar = block:
+    # Handle slicing cases like foo[0..<1, 0..<2]
+    # that do not return `var` but are technically `var`
+    # if `foo` is var
+    if t.kind in {nnkDotExpr, nnkBracketExpr}:
+      let t0 = t[0]
+      quote do: isVar(`t0`)
+    else:
+      quote do: isVar(`t`)
+
+  proc flattenedName(node: NimNode): string =
+    if node.kind in {nnkDotExpr, nnkBracketExpr}:
+      result = flattenedName(node[0])
+      result &= '_'
+      result &= flattenedName(node[1])
+    elif node.kind in {nnkIdent, nnkSym}:
+      result = $node
+    else:
+      error "Expected a field nameor dot expression or array access but found \"" &
+        t.repr()
+
+  let alias = flattenedName(t)
+
+  return (newIdentNode($alias & "_alias" & $id & '_'), isVar)
+
 proc initForEach*(
         params: NimNode,
         values, aliases, raw_ptrs: var NimNode,
@@ -24,7 +83,9 @@ proc initForEach*(
   var tensors = nnkBracket.newTree()
 
   template syntaxError() {.dirty.} =
-    error "Syntax error: argument " & ($arg.kind).substr(3) & " in position #" & $i & " was unexpected."
+    error "Syntax error: argument " &
+      ($arg.kind).substr(3) & " in position #" & $i & " was unexpected." &
+      "\n    " & arg.repr()
 
   for i, arg in params:
     if arg.kind == nnkInfix:
@@ -56,15 +117,15 @@ proc initForEach*(
   aliases_stmt.add newCall(bindSym"withCompilerOptimHints")
 
   for i, tensor in tensors:
-    let alias = newIdentNode($tensor & "_alias" & $i & '_')
+    let (alias, detectVar) = aliasTensor(i, tensor)
     aliases.add alias
     aliases_stmt.add quote do:
-      when isVar(`tensor`):
+      when `detectVar`:
         var `alias`{.align_variable.} = `tensor`
       else:
         let `alias`{.align_variable.} = `tensor`
 
-    let raw_ptr_i = genSym(nskLet, $tensor & "_raw_data" & $i & '_')
+    let raw_ptr_i = genSym(nskLet, $alias & "_raw_data" & $i & '_')
     raw_ptrs_stmt.add quote do:
       let `raw_ptr_i`{.restrict.} = `alias`.unsafe_raw_offset()
     raw_ptrs.add raw_ptr_i
@@ -74,7 +135,9 @@ proc initForEach*(
   for i in 1 ..< aliases.len:
     let alias_i = aliases[i]
     test_shapes.add quote do:
-      assert `alias0`.shape == `alias_i`.shape
+      assert `alias0`.shape == `alias_i`.shape,
+        "\n " & astToStr(`alias0`) & ".shape is " & $`alias0`.shape & "\n" &
+        "\n " & astToStr(`alias_i`) & ".shape is " & $`alias_i`.shape & "\n"
 
 template stridedVarsSetup*(): untyped {.dirty.} =
   for i, alias in aliases:

diff --git a/src/arraymancer/laser/tensor/initialization.nim b/src/arraymancer/laser/tensor/initialization.nim
@@ -136,14 +136,20 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
     withCompilerOptimHints()
     doAssert dst.size == len, "Tensor size and buffer length should be the same"
     let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
-    omp_parallel_chunks(
-            len, chunk_offset, chunk_size,
-            OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
-        copyMem(
-          dst.unsafe_raw_offset[chunk_offset].addr,
-          buf[chunk_offset].unsafeAddr,
-          chunk_size * sizeof(T)
-        )
+    # No OpenMP - unexplained overhead https://github.com/mratsim/Arraymancer/issues/485
+    # omp_parallel_chunks(
+    #         len, chunk_offset, chunk_size,
+    #         OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
+    #     copyMem(
+    #       dst.unsafe_raw_offset[chunk_offset].addr,
+    #       buf[chunk_offset].unsafeAddr,
+    #       chunk_size * sizeof(T)
+    #     )
+    copyMem(
+      dst.unsafe_raw_offset[0].addr,
+      buf[0].unsafeAddr,
+      len * sizeof(T)
+    )
   else:
     {.fatal: "Only non-ref types and types with trivial destructors can be raw copied.".}
 
@@ -166,14 +172,19 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
   when not (T is KnownSupportsCopyMem):
     t.storage.raw_buffer.reset()
     t.storage.raw_buffer.setLen(t.size)
-  else:
-    omp_parallel_chunks(
-          t.size, chunk_offset, chunk_size,
-          OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
-      zeroMem(
-        t.unsafe_raw_offset[chunk_offset].addr,
-        chunk_size * sizeof(T)
-      )
+  else: # if setZero or newTensor are used in OpenMP parallel regions
+        # making this parallel will kill performance.
+    # omp_parallel_chunks(
+    #       t.size, chunk_offset, chunk_size,
+    #       OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
+    #   zeroMem(
+    #     t.unsafe_raw_offset[chunk_offset].addr,
+    #     chunk_size * sizeof(T)
+    #   )
+    zeroMem(
+      t.unsafe_raw_offset[0].addr,
+      t.size * sizeof(T)
+    )
 
 proc newTensor*[T](shape: varargs[int]): Tensor[T] =
   var size: int

diff --git a/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim b/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim
@@ -53,11 +53,13 @@ proc gelsd*[T: SomeFloat](
   var b2: Tensor[T]
   b2.newMatrixUninitColMajor(ldb.int, nrhs.int)
 
-  var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs] # Workaround because slicing does no produce a var at the moment
-  apply2_inline(b2_slice, b):
-    # paste b in b2.
-    # if b2 is larger, the rest is zeros
-    y
+  var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs]
+  if is1d:
+    b2_slice = b2_slice.squeeze(1)
+
+  forEach ib2 in b2_slice,
+          ib in b:
+    ib2 = ib
 
   # Temporary sizes
   const smlsiz = 25'i32 # equal to the maximum size of the subproblems at the bottom of the computation tree

diff --git a/src/arraymancer/ml/dimensionality_reduction/pca.nim b/src/arraymancer/ml/dimensionality_reduction/pca.nim
@@ -187,8 +187,10 @@ proc pca_detailed*[T: SomeFloat](
 
   # Variance explained by Singular Values
   let bessel_correction = T(result.n_observations - 1)
-  result.explained_variance = map_inline(S):
-    x * x / bessel_correction
+  result.explained_variance = newTensorUninit[T](S.shape)
+  forEach ev in result.explained_variance,
+          s in S:
+    ev = s*s / bessel_correction
 
   # Since we are using SVD truncated to `n_components` we need to
   # refer back to the original matrix for total variance

diff --git a/src/arraymancer/ml/metrics/common_error_functions.nim b/src/arraymancer/ml/metrics/common_error_functions.nim
@@ -22,7 +22,11 @@ proc squared_error*[T](y, y_true: T): T {.inline.} =
 
 proc squared_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Element-wise squared error for a tensor, |y_true - y| ^2
-  result = map2_inline(y_true, y, squared_error(x,y))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = squared_error(testval,truthval)
 
 proc mean_squared_error*[T](y, y_true: Tensor[T]): T =
   ## Also known as MSE or L2 loss, mean squared error between elements:
@@ -48,7 +52,11 @@ proc relative_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Normally the relative error is defined as |y_true - x| / |y_true|,
   ## but here max is used to make it symmetric and to prevent dividing by zero,
   ## guaranteed to return zero in the case when both values are zero.
-  result = map2_inline(y, y_true, relative_error(x,y))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = relative_error(testval,truthval)
 
 proc mean_relative_error*[T](y, y_true: Tensor[T]): T =
   ## Mean relative error for Tensor, mean of the element-wise
@@ -67,10 +75,14 @@ proc absolute_error*[T: SomeFloat](y, y_true: T): T {.inline.} =
 
 proc absolute_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Element-wise absolute error for a tensor
-  result = map2_inline(y, y_true, y.absolute_error(x))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = absolute_error(testval,truthval)
 
 proc mean_absolute_error*[T](y, y_true: Tensor[T]): T =
   ## Also known as L1 loss, absolute error between elements:
   ## sum(|y_true - y|)/m
   ## where m is the number of elements
-  result = map2_inline(y, y_true, y.absolute_error(x)).mean()
+  result = y.absolute_error(y_true).mean()
diff --git a/src/arraymancer/nn/loss/mean_square_error_loss.nim b/src/arraymancer/nn/loss/mean_square_error_loss.nim
@@ -30,8 +30,10 @@ proc mse_backward_ag[TT](self: MSELoss[TT], payload: Payload[TT]): SmallDiffs[TT
                                                   # See also Stanford course: http://theory.stanford.edu/~tim/s15/l/l15.pdf
 
   result = newDiffs[TT](1)
-  result[0] = map2_inline(self.cache.value, self.target):
-    norm * (x - y)
+  forEach r0 in result[0],
+          v in self.cache.value,
+          t in self.target:
+    r0 = norm * (v - t)
 
 proc mse_cache[TT](result: Variable[TT], input: Variable[TT], target: TT) =
   ## We expect input with shape [batch_size, features]