Linear regression (fixes #208) (#211)
* Add lapack support

* Wrapper done. TODO tests

* Least square working for 1d, not for 2d

* Fix residual, the first value wasn't squared

* Least squares solver fully implemented 🎆
mratsim committed Apr 1, 2018
1 parent 068075d commit bb8ad5c
Showing 13 changed files with 317 additions and 8 deletions.
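For orientation, here is a minimal usage sketch of the least_squares_solver proc this commit introduces. It is hedged: the data and expected values come from the NumPy-based test added below, and the top-level import is assumed to re-export the proc, as src/arraymancer.nim does when no_lapack is not defined.

import arraymancer

# Fit y = slope*x + intercept in the least-squares sense
let A = [[0f, 1f],
         [1f, 1f],
         [2f, 1f],
         [3f, 1f]].toTensor
let y = [-1f, 0.2, 0.9, 2.1].toTensor

let (solution, residuals, matrix_rank, singular_values) = least_squares_solver(A, y)
echo solution     # ~ [1.0, -0.95] -> slope 1, intercept -0.95
echo residuals    # ~ [0.05]       -> sum of squared errors
echo matrix_rank  # 2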
3 changes: 2 additions & 1 deletion .appveyor.yml
@@ -29,12 +29,13 @@ install:
- SET PATH=%CD%\tools_tmp\%NIM_DIR%\bin;%CD%\tools_tmp\%MINGW_DIR%\bin;%PATH%
- ps: nuget install OpenBLAS -o "${env:APPVEYOR_BUILD_FOLDER}"
- ps: cp OpenBLAS.0.2.14.1/lib/native/bin/x64/libopenblas.dll blas.dll
# - ps: Start-FileDownload http://icl.cs.utk.edu/lapack-for-windows/libraries/VisualStudio/3.7.0/Dynamic-MINGW/Win64/liblapack.dll lapack.dll # This does not work, why?
- SET PATH=%PATH%;%CD%

build_script:
- nimble.exe refresh

test_script:
- nimble.exe test
- nimble.exe test_no_lapack

deploy: off
1 change: 1 addition & 0 deletions .travis.yml
@@ -47,6 +47,7 @@ addons:
packages:
# On Linux we need OpenBLAS, on OSX Apple Accelerate is present by default
- libopenblas-dev
- liblapack-dev

before_install:
# On MacOS flame/blis can be tested as it is an homebrew package
11 changes: 9 additions & 2 deletions arraymancer.nimble
@@ -5,7 +5,7 @@ description = "A n-dimensional tensor (ndarray) library"
license = "Apache License 2.0"

### Dependencies
requires "nim >= 0.18.0", "nimblas >= 0.1.3", "nimcuda >= 0.1.4", "nimcl >= 0.1.2", "clblast"
requires "nim >= 0.18.0", "nimblas >= 0.1.3", "nimlapack >= 0.1.1", "nimcuda >= 0.1.4", "nimcl >= 0.1.2", "clblast"

## Install files
srcDir = "src"
@@ -54,6 +54,7 @@ template mkl_threadedSwitches() =
switch("define","openmp")
switch("stackTrace","off")
switch("define","blas=mkl_intel_lp64")
switch("define","lapack=mkl_intel_lp64")
switch("clibdir", "/opt/intel/mkl/lib/intel64")
switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a")
switch("passl", "-lmkl_core")
@@ -63,6 +64,7 @@ template mkl_threadedSwitches() =

template mkl_singleSwitches() =
switch("define","blas=mkl_intel_lp64")
switch("define","lapack=mkl_intel_lp64")
switch("clibdir", "/opt/intel/mkl/lib/intel64")
switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a")
switch("passl", "-lmkl_core")
@@ -110,7 +112,11 @@ task all_tests, "Run all tests - Intel MKL + OpenMP + Cuda + march=native + rele
cuda_mkl_openmp
test "full_test_suite", "cpp"

task test, "Run all tests - Default BLAS":
task test, "Run all tests - Default BLAS & Lapack":
test "tests_cpu"

task test_no_lapack, "Run all tests - Default BLAS without lapack":
switch("define", "no_lapack")
test "tests_cpu"

task test_cpp, "Run all tests - Cpp codegen":
@@ -133,6 +139,7 @@ task test_opencl, "Run all OpenCL backend tests":

task test_openblas, "Run all tests - OpenBLAS":
switch("define","blas=openblas")
switch("define","lapack=openblas")
when defined(macosx):
## Should work but somehow Nim doesn't find libopenblas.dylib on MacOS
switch("clibdir", "/usr/local/opt/openblas/lib")
6 changes: 4 additions & 2 deletions nim.cfg
@@ -1,6 +1,6 @@
# Arraymancer compilation flag config

####From default nim.cfg, somehow it's not taken into account with a custom nim.cfg
#### From default nim.cfg, somehow it's not taken into account with a custom nim.cfg
@if release or quick:
obj_checks:off
field_checks:off
@@ -57,6 +57,7 @@ clang.options.size = "-Os"

@if openblas:
define:"blas=openblas" # For nimblas
define:"blas=openblas" # For nimlapack
@if macosx:
clibdir:"/usr/local/opt/openblas/lib"
cincludes:"/usr/local/opt/openblas/include"
@@ -71,6 +72,7 @@ clang.options.size = "-Os"
@if mkl: # MKL multi_threaded
define:"openmp"
define:"blas=mkl_intel_lp64"
define:"lapack=mkl_intel_lp64"
clibdir:"/opt/intel/mkl/lib/intel64"
passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
passl:"-lmkl_core"
@@ -87,4 +89,4 @@ clang.options.size = "-Os"
gcc.exe:"/usr/local/bin/gcc-7"
gcc.linkerexe:"/usr/local/bin/gcc-7"
@end
@end
@end
4 changes: 4 additions & 0 deletions src/arraymancer.nim
@@ -43,3 +43,7 @@ export tensor,
mnist,
io_csv,
ml

when not defined(no_lapack):
import ./linear_algebra/linear_algebra
export linear_algebra
136 changes: 136 additions & 0 deletions src/linear_algebra/least_squares.nim
@@ -0,0 +1,136 @@
# Copyright (c) 2018 the Arraymancer contributors
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.

import ../tensor/tensor, ../tensor/backend/metadataArray,
../tensor/private/p_init_cpu,
nimlapack, fenv, math


proc gelsd(m: ptr cint; n: ptr cint; nrhs: ptr cint; a: ptr cfloat; lda: ptr cint;
b: ptr cfloat; ldb: ptr cint; s: ptr cfloat; rcond: ptr cfloat; rank: ptr cint;
work: ptr cfloat; lwork: ptr cint; iwork: ptr cint; info: ptr cint) {.inline.}=
sgelsd(
m, n, nrhs,
a, m,
b, ldb,
s, rcond, rank,
work, lwork,
iwork, info
)

proc gelsd(m: ptr cint; n: ptr cint; nrhs: ptr cint; a: ptr cdouble; lda: ptr cint;
b: ptr cdouble; ldb: ptr cint; s: ptr cdouble; rcond: ptr cdouble;
rank: ptr cint; work: ptr cdouble; lwork: ptr cint; iwork: ptr cint;
info: ptr cint) {.inline.}=
dgelsd(
m, n, nrhs,
a, m,
b, ldb,
s, rcond, rank,
work, lwork,
iwork, info
)


proc least_squares_solver*[T: SomeReal](a, b: Tensor[T]):
tuple[
least_square_sol: Tensor[T],
residuals: Tensor[T],
matrix_rank: int,
singular_values: Tensor[T]
] {.noInit.}=

assert a.rank == 2 and b.rank in {1, 2} and a.shape[0] > 0 and a.shape[0] == b.shape[0]
# We need to copy the input as Lapack will destroy A and replace B with its result.
# Furthermore it expects a column major ordering.
# In the future with move operator we can consider letting moved tensors be destroyed.

# Note on dealing with row-major without copies:
# http://drsfenner.org/blog/2016/01/into-the-weeds-iii-hacking-lapack-calls-to-save-memory/

# LAPACK reference doc:
# http://www.netlib.org/lapack/explore-html/d7/d3b/group__double_g_esolve_ga94bd4a63a6dacf523e25ff617719f752.html
# Intel example for the Fortran API
# https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/dgelsd_ex.c.htm

let is1d = b.rank == 1 # Axis 1 will be squeezed at the end for the 1d case

var # Parameters
m = a.shape[0].cint
n = a.shape[1].cint
nrhs = if is1d: 1.cint
else: b.shape[1].cint
# A is copied to column-major below, so lda = m
ldb = max(m, n).cint

# Shadowing the inputs
var a = a.clone(colMajor) # Lapack destroys the A matrix during solving

# Lapack replaces the B matrix by the result during solving
# Furthermore, input B has shape M x NRHS while the output has shape N x NRHS,
# so if M < N we must zero the remainder of the tensor
var bstar: Tensor[T]
tensorCpu([ldb.int, nrhs.int], bstar, colMajor)
bstar.storage.Fdata = newSeq[T](bstar.size)

var bstar_slice = bstar[0 ..< b.shape[0], 0 ..< nrhs] # Workaround because slicing does not produce a var at the moment
apply2_inline(bstar_slice, b):
# paste b in bstar.
# if bstar is larger, the rest is zeros
y

# Temporary sizes
const smlsiz = 25.cint # equal to the maximum size of the subproblems at the bottom of the computation tree
let
minmn = min(m,n).int
nlvl = max(0, (minmn.T / (smlsiz+1).T).ln.cint + 1)
# Bug in some Lapack, liwork must be determined manually: http://icl.cs.utk.edu/lapack-forum/archives/lapack/msg00899.html
liwork = max(1, 3 * minmn * nlvl + 11 * minmn)

result.singular_values = newTensorUninit[T](minmn) # will hold the singular values of A

var # Temporary parameter values
# Condition for a float to be considered 0
rcond = epsilon(T) * a.shape.max.T * a.max
lwork = max(1, 12 * m + 2 * m * smlsiz + 8 * m * nlvl + m * nrhs + (smlsiz + 1) ^ 2)
work = newSeqUninitialized[T](lwork)
iwork = newSeqUninitialized[cint](liwork)
info, rank: cint

# Solve the equations A*X = B
gelsd(
m.addr, n.addr, nrhs.addr,
a.get_data_ptr, m.addr, # lda
bstar.get_data_ptr, ldb.addr,
result.singular_values[0].addr, rcond.addr, rank.addr,
work[0].addr, lwork.addr,
iwork[0].addr, info.addr
)

if info > 0:
# TODO: this should not be an exception; failing to converge can happen and should
# not interrupt the program. Alternative: fill the result with Inf?
raise newException(ValueError, "the algorithm for computing the SVD failed to converge")

if info < 0:
raise newException(ValueError, "Illegal parameter in linear square solver gelsd: " & $(-info))

result.matrix_rank = rank.int # This is the matrix rank, not the tensor rank
result.least_square_sol = bstar[0 ..< n, _].squeeze(axis = 1) # Correction for 1d case

if rank == n and m > n:
result.residuals = (bstar[n .. _, _].squeeze(axis = 1)).fold_axis_inline(Tensor[T], fold_axis = 0):
x = y .^ 2f # initial value
do:
x += y .^ 2f # core loop
do:
x += y # values were stored in a temporary array of size == nb of cores
# to avoid multithreading issues and must be reduced one last time
else:
result.residuals = newTensorUninit[T](0)
# Workaround as we can't create empty tensors for now:
result.residuals.shape[0] = 0
result.residuals.shape.len = 0
result.residuals.strides[0] = 0
result.residuals.strides.len = 0
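A hedged sanity check of the residual definition above (not part of the commit): when the system is over-determined and full-rank (rank == n and m > n), the returned residual should equal the explicit sum of squared errors |A*x - y|^2. The data is reused from the single-equation test added below.

import arraymancer, math

let A = [[0f, 1f], [1f, 1f], [2f, 1f], [3f, 1f]].toTensor
let y = [-1f, 0.2, 0.9, 2.1].toTensor
let (x, residuals, _, _) = least_squares_solver(A, y)

var sse = 0'f32                                  # explicit |A*x - y|^2
for i in 0 ..< A.shape[0]:
  sse += (A[i, 0]*x[0] + A[i, 1]*x[1] - y[i])^2
echo abs(sse - residuals[0]) < 1e-5'f32          # expected: true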
7 changes: 7 additions & 0 deletions src/linear_algebra/linear_algebra.nim
@@ -0,0 +1,7 @@
# Copyright (c) 2018 the Arraymancer contributors
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.

import ./least_squares

export least_squares
4 changes: 4 additions & 0 deletions src/tensor/backend/metadataArray.nim
@@ -226,3 +226,7 @@ proc concat*[T](dsas: varargs[DynamicStackArray[T]]): DynamicStackArray[T] =
for val in dsa:
result[i] = val
inc(i)

proc max*[T](a: DynamicStackArray[T]): T {.noSideEffect, inline.} =
for val in a:
result = max(result, val)
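A small sketch of the new max overload on DynamicStackArray (hedged: assumed to be compiled from inside the repo, mirroring the import style of least_squares.nim above, where shape.max is used to scale rcond for gelsd):

import ../tensor/tensor, ../tensor/backend/metadataArray

let t = zeros[float32](3, 5)
echo t.shape.max   # 5 -- the largest dimension of the tensor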
7 changes: 4 additions & 3 deletions src/tensor/init_copy_cpu.nim
@@ -13,15 +13,16 @@
# limitations under the License.

import ./data_structure,
./higher_order_applymap
./higher_order_applymap,
./private/p_shapeshifting

# Unfortunately higher_order depends on init_cpu and "clone" depends on higher_order, so we need an extra file
# to deal with circular dependencies

proc clone*[T](t: Tensor[T]): Tensor[T] {.noInit.}=
proc clone*[T](t: Tensor[T], layout: OrderType = rowMajor): Tensor[T] {.noInit.}=
## Input:
## - A tensor
## Returns:
## - A full copy. The copy is recreated as a contiguous tensor.
## If the input was a slice, unused values are discarded.
result = t.map_inline(x)
contiguousImpl(t, layout, result)
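A brief sketch of the extended clone (hedged: mirrors how least_squares.nim above requests a column-major copy before handing the buffer to LAPACK; colMajor is assumed visible through the tensor import, otherwise it comes from nimblas's OrderType, which the new signature uses):

import ../tensor/tensor
# import nimblas   # if colMajor / OrderType is not re-exported, it lives here

let t = [[1f, 2f],
         [3f, 4f]].toTensor      # row-major by default
let tc = t.clone(colMajor)       # full contiguous copy, column-major layout
echo tc == t                     # true: same values, only the memory layout differs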
80 changes: 80 additions & 0 deletions tests/linear_algebra/test_linear_algebra.nim
@@ -0,0 +1,80 @@
# Copyright (c) 2018 the Arraymancer contributors
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).

import ../../src/arraymancer
import unittest, math, fenv

suite "Linear algebra":
test "Linear equation solver using least squares":
block: # "Single equation"
# Example from Numpy documentation
let A = [
[0f, 1f],
[1f, 1f],
[2f, 1f],
[3f, 1f]
].toTensor

let y = [-1f, 0.2, 0.9, 2.1].toTensor

let (solution, residuals, matrix_rank, singular_values) = least_squares_solver(A, y)

# From Numpy with double precision and `lstsq` function
let
expected_sol = [1f, -0.95f].toTensor
expected_residuals = [0.05f].toTensor
expected_matrix_rank = 2
expected_sv = [ 4.10003045f, 1.09075677].toTensor

check:
mean_relative_error(solution, expected_sol) < 1e-6
mean_relative_error(residuals, expected_residuals) < 1e-6
matrix_rank == expected_matrix_rank
mean_relative_error(singular_values, expected_sv) < 1e-6

block: # Example from Eigen
# https://eigen.tuxfamily.org/dox/group__LeastSquares.html
let A = [[ 0.68 , 0.597],
[-0.211, 0.823],
[ 0.566, -0.605]].toTensor
let b = [-0.33, 0.536, -0.444].toTensor

let (solution, _, _, _) = least_squares_solver(A, b)
let expected_sol = [-0.67, 0.314].toTensor

check: mean_relative_error(solution, expected_sol) < 1e-3

block: # "Multiple independant equations"
# Example from Intel documentation:
# https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/dgelsd_ex.c.htm
let a = [
[ 0.12, -8.19, 7.69, -2.26, -4.71],
[-6.91, 2.22, -5.12, -9.08, 9.96],
[-3.33, -8.94, -6.72, -4.40, -9.98],
[ 3.97, 3.33, -2.74, -7.92, -3.20]
].toTensor

let b = [
[ 7.30, 0.47, -6.28],
[ 1.33, 6.58, -3.42],
[ 2.68, -1.71, 3.46],
[-9.62, -0.79, 0.41]
].toTensor

let (solution, residuals, matrix_rank, singular_values) = least_squares_solver(a, b)

# From Intel minimum norm solution
let expected_sol = [[-0.69, -0.24, 0.06],
[-0.80, -0.08, 0.21],
[ 0.38, 0.12, -0.65],
[ 0.29, -0.24, 0.42],
[ 0.29, 0.35, -0.30]].toTensor
# No residuals
let expected_matrix_rank = 4
let expected_sv = [ 18.66, 15.99, 10.01, 8.51].toTensor

check:
mean_relative_error(solution, expected_sol) < 0.015
residuals.rank == 0 and residuals.shape[0] == 0 and residuals.strides[0] == 0
matrix_rank == expected_matrix_rank
mean_relative_error(singular_values, expected_sv) < 1e-03
33 changes: 33 additions & 0 deletions tests/manual_checks/least_squares.nim
@@ -0,0 +1,33 @@

import ../../src/arraymancer

block:
let a = [
[ 0.12f, -8.19, 7.69, -2.26, -4.71],
[-6.91f, 2.22, -5.12, -9.08, 9.96],
[-3.33f, -8.94, -6.72, -4.40, -9.98],
[ 3.97f, 3.33, -2.74, -7.92, -3.20]
].toTensor

let b = [
[ 7.30f, 0.47f, -6.28f],
[ 1.33f, 6.58f, -3.42f],
[ 2.68f, -1.71f, 3.46f],
[-9.62f, -0.79f, 0.41f]
].toTensor

let sol = least_squares_solver(a, b)
echo sol

block:
let A = [
[0f, 1f],
[1f, 1f],
[2f, 1f],
[3f, 1f]
].toTensor

let y = [-1f, 0.2, 0.9, 2.1].toTensor

let sol = least_squares_solver(A, y)
echo sol
