Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Laser forEach iterators #479

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 49 additions & 47 deletions benchmarks/ex01_xor.nim
Original file line number Diff line number Diff line change
@@ -1,50 +1,52 @@
import ../src/arraymancer

# Learning XOR function with a neural network.

# Autograd context / neuralnet graph
let ctx = newContext Tensor[float32]
let bsz = 32 # batch size

let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
let x_train = ctx.variable(x_train_bool.astype(float32))
let y = y_bool.astype(float32)

# We will build the following network:
# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss

let layer_3neurons = ctx.variable(
randomTensor(3, 2, 2.0f) -. 1.0f,
requires_grad = true
)

let classifier_layer = ctx.variable(
randomTensor(1, 3, 2.0f) -. 1.0f,
requires_grad = true
)

# Stochastic Gradient Descent
let optim = newSGD[float32](
layer_3neurons, classifier_layer, 0.01f
)

# Learning loop
for epoch in 0..10000:
for batch_id in 0..<100:

# minibatch offset in the Tensor
let offset = batch_id * 32
let x = x_train[offset ..< offset + 32, _]
let target = y[offset ..< offset + 32, _]

# Building the network
let n1 = relu linear(x, layer_3neurons)
let n2 = linear(n1, classifier_layer)
let loss = n2.sigmoid_cross_entropy(target)

# Compute the gradient (i.e. contribution of each parameter to the loss)
loss.backprop()

# Correct the weights now that we have the gradient information
optim.update()
proc main() =
# Autograd context / neuralnet graph
let ctx = newContext Tensor[float32]
let bsz = 32 # batch size

let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
let x_train = ctx.variable(x_train_bool.astype(float32))
let y = y_bool.astype(float32)

# We will build the following network:
# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss

let layer_3neurons = ctx.variable(
randomTensor(3, 2, 2.0f) -. 1.0f,
requires_grad = true
)

let classifier_layer = ctx.variable(
randomTensor(1, 3, 2.0f) -. 1.0f,
requires_grad = true
)

# Stochastic Gradient Descent
let optim = newSGD[float32](
layer_3neurons, classifier_layer, 0.01f
)

# Learning loop
for epoch in 0..10000:
for batch_id in 0..<100:

# minibatch offset in the Tensor
let offset = batch_id * 32
let x = x_train[offset ..< offset + 32, _]
let target = y[offset ..< offset + 32, _]

# Building the network
let n1 = relu linear(x, layer_3neurons)
let n2 = linear(n1, classifier_layer)
let loss = n2.sigmoid_cross_entropy(target)

# Compute the gradient (i.e. contribution of each parameter to the loss)
loss.backprop()

# Correct the weights now that we have the gradient information
optim.update()

main()
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](

result = zeros_like(cached_tensor)

# TODO: nested parallelism has suspect performance
for i in 0||(batch_size-1):
let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](

result = zeros_like(cached_tensor)

# TODO: nested parallelism has suspect performance
for i in 0||(batch_size-1): # Can't use OpenMP - SIGSEGV Illegal Address
let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp

Expand Down
2 changes: 1 addition & 1 deletion examples/ex06_shakespeare_generator.nim
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ const
#
# ################################################################

func strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
proc strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
result = newTensor[PrintableIdx](str.len)

# For each x in result, map the corresponding char index
Expand Down
2 changes: 1 addition & 1 deletion src/arraymancer/io/io_image.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ proc read_image*(filepath: string): Tensor[uint8] =
## Usage example with conversion to [0..1] float:
## .. code:: nim
## let raw_img = read_image('path/to/image.png')
## let img = raw_img.map_inline:
## let img = forEach x in raw_img:
## x.float32 / 255.0

var width, height, channels: int
Expand Down
77 changes: 70 additions & 7 deletions src/arraymancer/laser/strided_iteration/foreach_common.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,73 @@
# This file may not be copied, modified, or distributed except according to those terms.

import
macros,
std/[macros, strutils],
../compiler_optim_hints

template isVar[T: object](x: T): bool =
template isVar[T](x: T): bool =
## Workaround due to `is` operator not working for `var`
## https://github.com/nim-lang/Nim/issues/9443
compiles(addr(x))

proc aliasTensor(id: int, tensor: NimNode): tuple[alias: NimNode, isVar: NimNode] =
## Produce an alias variable for a tensor
## Supports:
## - identifiers
## - dot call for field access
## - foo[1] for array access
if tensor.kind notin {nnkIdent, nnkSym, nnkDotExpr, nnkBracketExpr, nnkCall}:
error "Expected a variable identifier or field access or array indexing but found \"" & tensor.repr() &
"\". Please assign to a variable first."

var t = tensor

# nnkCall handles the annoying typed AST we get in generics
# Call
# OpenSymChoice
# Sym "[]"
# Sym "[]"
# ...
# DotExpr
# Ident "self"
# Ident "first_moments"
# Ident "i"
if tensor.kind == nnkCall:
tensor[0].expectKind nnkOpenSymChoice
tensor[0][0].expectKind nnkSym
doAssert tensor[0][0].eqIdent("[]")

# Rewrite the AST to untyped
t = nnkBracketExpr.newTree(
tensor[1]
)
for i in 2 ..< tensor.len:
t.add tensor[i]

let isVar = block:
# Handle slicing cases like foo[0..<1, 0..<2]
# that do not return `var` but are technically `var`
# if `foo` is var
if t.kind in {nnkDotExpr, nnkBracketExpr}:
let t0 = t[0]
quote do: isVar(`t0`)
else:
quote do: isVar(`t`)

proc flattenedName(node: NimNode): string =
if node.kind in {nnkDotExpr, nnkBracketExpr}:
result = flattenedName(node[0])
result &= '_'
result &= flattenedName(node[1])
elif node.kind in {nnkIdent, nnkSym}:
result = $node
else:
error "Expected a field nameor dot expression or array access but found \"" &
t.repr()

let alias = flattenedName(t)

return (newIdentNode($alias & "_alias" & $id & '_'), isVar)

proc initForEach*(
params: NimNode,
values, aliases, raw_ptrs: var NimNode,
Expand All @@ -24,7 +83,9 @@ proc initForEach*(
var tensors = nnkBracket.newTree()

template syntaxError() {.dirty.} =
error "Syntax error: argument " & ($arg.kind).substr(3) & " in position #" & $i & " was unexpected."
error "Syntax error: argument " &
($arg.kind).substr(3) & " in position #" & $i & " was unexpected." &
"\n " & arg.repr()

for i, arg in params:
if arg.kind == nnkInfix:
Expand Down Expand Up @@ -56,15 +117,15 @@ proc initForEach*(
aliases_stmt.add newCall(bindSym"withCompilerOptimHints")

for i, tensor in tensors:
let alias = newIdentNode($tensor & "_alias" & $i & '_')
let (alias, detectVar) = aliasTensor(i, tensor)
aliases.add alias
aliases_stmt.add quote do:
when isVar(`tensor`):
when `detectVar`:
var `alias`{.align_variable.} = `tensor`
else:
let `alias`{.align_variable.} = `tensor`

let raw_ptr_i = genSym(nskLet, $tensor & "_raw_data" & $i & '_')
let raw_ptr_i = genSym(nskLet, $alias & "_raw_data" & $i & '_')
raw_ptrs_stmt.add quote do:
let `raw_ptr_i`{.restrict.} = `alias`.unsafe_raw_offset()
raw_ptrs.add raw_ptr_i
Expand All @@ -74,7 +135,9 @@ proc initForEach*(
for i in 1 ..< aliases.len:
let alias_i = aliases[i]
test_shapes.add quote do:
assert `alias0`.shape == `alias_i`.shape
assert `alias0`.shape == `alias_i`.shape,
"\n " & astToStr(`alias0`) & ".shape is " & $`alias0`.shape & "\n" &
"\n " & astToStr(`alias_i`) & ".shape is " & $`alias_i`.shape & "\n"

template stridedVarsSetup*(): untyped {.dirty.} =
for i, alias in aliases:
Expand Down
43 changes: 27 additions & 16 deletions src/arraymancer/laser/tensor/initialization.nim
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,20 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
withCompilerOptimHints()
doAssert dst.size == len, "Tensor size and buffer length should be the same"
let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
omp_parallel_chunks(
len, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
copyMem(
dst.unsafe_raw_offset[chunk_offset].addr,
buf[chunk_offset].unsafeAddr,
chunk_size * sizeof(T)
)
# No OpenMP - unexplained overhead https://github.com/mratsim/Arraymancer/issues/485
# omp_parallel_chunks(
# len, chunk_offset, chunk_size,
# OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
# copyMem(
# dst.unsafe_raw_offset[chunk_offset].addr,
# buf[chunk_offset].unsafeAddr,
# chunk_size * sizeof(T)
# )
copyMem(
dst.unsafe_raw_offset[0].addr,
buf[0].unsafeAddr,
len * sizeof(T)
)
else:
{.fatal: "Only non-ref types and types with trivial destructors can be raw copied.".}

Expand All @@ -166,14 +172,19 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
when not (T is KnownSupportsCopyMem):
t.storage.raw_buffer.reset()
t.storage.raw_buffer.setLen(t.size)
else:
omp_parallel_chunks(
t.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
zeroMem(
t.unsafe_raw_offset[chunk_offset].addr,
chunk_size * sizeof(T)
)
else: # if setZero or newTensor are used in OpenMP parallel regions
# making this parallel will kill performance.
# omp_parallel_chunks(
# t.size, chunk_offset, chunk_size,
# OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
# zeroMem(
# t.unsafe_raw_offset[chunk_offset].addr,
# chunk_size * sizeof(T)
# )
zeroMem(
t.unsafe_raw_offset[0].addr,
t.size * sizeof(T)
)

proc newTensor*[T](shape: varargs[int]): Tensor[T] =
var size: int
Expand Down
12 changes: 7 additions & 5 deletions src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,13 @@ proc gelsd*[T: SomeFloat](
var b2: Tensor[T]
b2.newMatrixUninitColMajor(ldb.int, nrhs.int)

var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs] # Workaround because slicing does no produce a var at the moment
apply2_inline(b2_slice, b):
# paste b in b2.
# if b2 is larger, the rest is zeros
y
var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs]
if is1d:
b2_slice = b2_slice.squeeze(1)

forEach ib2 in b2_slice,
ib in b:
ib2 = ib

# Temporary sizes
const smlsiz = 25'i32 # equal to the maximum size of the subproblems at the bottom of the computation tree
Expand Down
6 changes: 4 additions & 2 deletions src/arraymancer/ml/dimensionality_reduction/pca.nim
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,10 @@ proc pca_detailed*[T: SomeFloat](

# Variance explained by Singular Values
let bessel_correction = T(result.n_observations - 1)
result.explained_variance = map_inline(S):
x * x / bessel_correction
result.explained_variance = newTensorUninit[T](S.shape)
forEach ev in result.explained_variance,
s in S:
ev = s*s / bessel_correction

# Since we are using SVD truncated to `n_components` we need to
# refer back to the original matrix for total variance
Expand Down
20 changes: 16 additions & 4 deletions src/arraymancer/ml/metrics/common_error_functions.nim
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ proc squared_error*[T](y, y_true: T): T {.inline.} =

proc squared_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
## Element-wise squared error for a tensor, |y_true - y| ^2
result = map2_inline(y_true, y, squared_error(x,y))
result = newTensorUninit[T](y.shape)
forEach r in result,
testval in y,
truthval in y_true:
r = squared_error(testval,truthval)

proc mean_squared_error*[T](y, y_true: Tensor[T]): T =
## Also known as MSE or L2 loss, mean squared error between elements:
Expand All @@ -48,7 +52,11 @@ proc relative_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
## Normally the relative error is defined as |y_true - x| / |y_true|,
## but here max is used to make it symmetric and to prevent dividing by zero,
## guaranteed to return zero in the case when both values are zero.
result = map2_inline(y, y_true, relative_error(x,y))
result = newTensorUninit[T](y.shape)
forEach r in result,
testval in y,
truthval in y_true:
r = relative_error(testval,truthval)

proc mean_relative_error*[T](y, y_true: Tensor[T]): T =
## Mean relative error for Tensor, mean of the element-wise
Expand All @@ -67,10 +75,14 @@ proc absolute_error*[T: SomeFloat](y, y_true: T): T {.inline.} =

proc absolute_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
## Element-wise absolute error for a tensor
result = map2_inline(y, y_true, y.absolute_error(x))
result = newTensorUninit[T](y.shape)
forEach r in result,
testval in y,
truthval in y_true:
r = absolute_error(testval,truthval)

proc mean_absolute_error*[T](y, y_true: Tensor[T]): T =
## Also known as L1 loss, absolute error between elements:
## sum(|y_true - y|)/m
## where m is the number of elements
result = map2_inline(y, y_true, y.absolute_error(x)).mean()
result = y.absolute_error(y_true).mean()
6 changes: 4 additions & 2 deletions src/arraymancer/nn/loss/mean_square_error_loss.nim
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ proc mse_backward_ag[TT](self: MSELoss[TT], payload: Payload[TT]): SmallDiffs[TT
# See also Stanford course: http://theory.stanford.edu/~tim/s15/l/l15.pdf

result = newDiffs[TT](1)
result[0] = map2_inline(self.cache.value, self.target):
norm * (x - y)
forEach r0 in result[0],
v in self.cache.value,
t in self.target:
r0 = norm * (v - t)

proc mse_cache[TT](result: Variable[TT], input: Variable[TT], target: TT) =
## We expect input with shape [batch_size, features]
Expand Down
Loading