Skip to content

Commit

Permalink
Add CPU only option for Allie by reusing the BLAS backend from Leela.
Browse files Browse the repository at this point in the history
  • Loading branch information
manyoso committed Jun 7, 2019
1 parent 1be89bb commit fe6416e
Show file tree
Hide file tree
Showing 26 changed files with 1,778 additions and 7 deletions.
2 changes: 2 additions & 0 deletions lib/blas.pri
@@ -0,0 +1,2 @@
# Enable the OpenBLAS code path (lib/neural/blas/blas.h keys off USE_OPENBLAS
# to include <cblas.h> and declare the OpenBLAS-specific routines) and link
# against the OpenBLAS library.
DEFINES += USE_OPENBLAS
LIBS += -lopenblas
21 changes: 18 additions & 3 deletions lib/lib.pro
Expand Up @@ -26,6 +26,7 @@ include(zlib.pri)
PROTOS += $$PWD/proto/net.proto
include(protobuf.pri)
include(cuda.pri)
include(blas.pri)

CONFIG(release, debug|release) {
CONFIG += optimize_full
Expand Down Expand Up @@ -59,11 +60,18 @@ HEADERS += \
$$PWD/neural/network.h \
$$PWD/neural/network_legacy.h \
$$PWD/neural/nn_policy.h \
$$PWD/neural/policy_map.h \
$$PWD/neural/weights_adapter.h \
$$PWD/neural/blas/blas.h \
$$PWD/neural/blas/convolution1.h \
$$PWD/neural/blas/fully_connected_layer.h \
$$PWD/neural/blas/se_unit.h \
$$PWD/neural/blas/winograd_convolution3.h \
$$PWD/neural/cuda/cuda_common.h \
$$PWD/neural/cuda/kernels.h \
$$PWD/neural/cuda/layers.h \
$$PWD/neural/shared/activation.h \
$$PWD/neural/shared/policy_map.h \
$$PWD/neural/shared/winograd_filter.h \
$$PWD/fathom/tbconfig.h \
$$PWD/fathom/tbcore.h \
$$PWD/fathom/tbprobe.h
Expand All @@ -87,10 +95,17 @@ SOURCES += \
$$PWD/tb.cpp \
$$PWD/uciengine.cpp \
$$PWD/zobrist.cpp \
$$PWD/neural/cuda/layers.cpp \
$$PWD/neural/cuda/nn_cuda.cpp \
$$PWD/neural/network_legacy.cpp \
$$PWD/neural/loader.cpp \
$$PWD/neural/nn_policy.cpp \
$$PWD/neural/weights_adapter.cpp \
$$PWD/neural/blas/convolution1.cpp \
$$PWD/neural/blas/fully_connected_layer.cpp \
$$PWD/neural/blas/nn_blas.cpp \
$$PWD/neural/blas/se_unit.cpp \
$$PWD/neural/blas/winograd_convolution3.cpp \
$$PWD/neural/cuda/layers.cpp \
$$PWD/neural/cuda/nn_cuda.cpp \
$$PWD/neural/shared/activation.cpp \
$$PWD/neural/shared/winograd_filter.cpp \
$$PWD/fathom/tbprobe.c
21 changes: 21 additions & 0 deletions lib/neural/blas/README.md
@@ -0,0 +1,21 @@
The files in this directory comprise the BLAS backend of Lc0.

## License

Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.

**The source files of this directory are not covered by any additional
permission.**


47 changes: 47 additions & 0 deletions lib/neural/blas/blas.h
@@ -0,0 +1,47 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

// Select the BLAS vendor based on defines
//
// Priority: USE_MKL first, then USE_OPENBLAS, then (on macOS only) the
// system Accelerate framework.  If none of these applies, no CBLAS header
// is included and translation units calling cblas_* will fail to compile.

#ifdef USE_MKL
#include <mkl.h>
#else

#ifdef USE_OPENBLAS
#include <cblas.h>

// Specific openblas routines.
// These extensions are not part of the standard CBLAS interface, so they
// are declared manually here; they expose OpenBLAS threading/build info.
extern "C" {
int openblas_get_num_procs(void);
void openblas_set_num_threads(int num_threads);
char* openblas_get_corename(void);
char* openblas_get_config(void);
}

#else

#ifdef __APPLE__
// macOS fallback: use the BLAS implementation bundled in Accelerate.
#include <Accelerate/Accelerate.h>
#define USE_ACCELERATE
#endif

#endif // USE_OPENBLAS

#endif // USE_MKL
65 changes: 65 additions & 0 deletions lib/neural/blas/convolution1.cpp
@@ -0,0 +1,65 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/blas/convolution1.h"
#include "neural/blas/blas.h"

namespace lczero {

void Convolution1::Forward(const size_t batch_size, const size_t input_channels,
                           const size_t output_channels, const float* input,
                           const float* weights, float* output) {
  // A 1x1 convolution over the 8x8 board is just a matrix product applied
  // independently to every batch entry:
  //
  //   output[o][sq] = sum_c weights[o][c] * input[c][sq]
  //
  // i.e. C = W * X with (all row-major):
  //   W: output_channels x input_channels
  //   X: input_channels  x kSquares
  //   C: output_channels x kSquares
  const size_t input_stride = kSquares * input_channels;
  const size_t output_stride = kSquares * output_channels;

  for (size_t batch = 0; batch < batch_size; ++batch) {
    const float* batch_input = input + batch * input_stride;
    float* batch_output = output + batch * output_stride;

    // C <- 1.0 * W * X + 0.0 * C  (single-precision GEMM)
    cblas_sgemm(CblasRowMajor,         // Row major format
                CblasNoTrans,          // W used as stored
                CblasNoTrans,          // X used as stored
                (int)output_channels,  // M: rows of W and C
                kSquares,              // N: cols of X and C
                (int)input_channels,   // K: cols of W / rows of X
                1.0f,                  // alpha
                weights,               // A = W
                (int)input_channels,   // lda, leading dimension of W
                batch_input,           // B = X
                kSquares,              // ldb, leading dimension of X
                0.0f,                  // beta
                batch_output,          // C
                kSquares);             // ldc, leading dimension of C
  }
}

} // namespace lczero
41 changes: 41 additions & 0 deletions lib/neural/blas/convolution1.h
@@ -0,0 +1,41 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstddef>
#include <vector>

namespace lczero {

// Convolution 1x1
//
// Stateless helper implementing a batched 1x1 convolution over 8x8 boards as
// one SGEMM per batch entry (see convolution1.cpp).
class Convolution1 {
 public:
  // Not instantiable: only static helpers.
  Convolution1() = delete;

  // Batched forward inference.
  // batch_size:      number of positions in the batch.
  // input_channels:  channels per input position.
  // output_channels: channels per output position.
  // input:           batch_size * input_channels * 64 floats.
  // weights:         output_channels * input_channels floats (1x1 kernels).
  // output:          batch_size * output_channels * 64 floats, overwritten.
  static void Forward(const size_t batch_size, const size_t input_channels,
                      const size_t output_channels, const float* input,
                      const float* weights, float* output);

 private:
  // Board geometry: a chess board is 8x8 = 64 squares.
  static constexpr auto kWidth = 8;
  static constexpr auto kHeight = 8;
  static constexpr auto kSquares = kWidth * kHeight;
};
} // namespace lczero
110 changes: 110 additions & 0 deletions lib/neural/blas/fully_connected_layer.cpp
@@ -0,0 +1,110 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/blas/fully_connected_layer.h"
#include "neural/blas/blas.h"

#include <algorithm>
#include <cassert>
#include <cmath>

namespace lczero {

void FullyConnectedLayer::Forward1D(size_t batch_size, const size_t input_size,
                                    const size_t output_size,
                                    const float* inputs, const float* weights,
                                    const float* biases, bool apply_relu,
                                    float* outputs) {
  // outputs := weights * inputs, then biased and optionally ReLU-clamped.
  // weights is an output_size x input_size row-major matrix.
  if (batch_size == 1) {
    // Single position: a plain matrix-vector product.
    //
    //   y <- 1.0 * W * x + 0.0 * y
    //
    // M = output_size rows, N = input_size cols, lda = input_size.
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                (int)output_size, (int)input_size, 1.0f, weights,
                (int)input_size, inputs, 1, 0.0f, outputs, 1);
  } else {
    // Batched: a matrix-matrix product, phrased column-major.
    //
    // The row-major weights array read column-major with leading dimension
    // input_size is W^T, so CblasTrans recovers W.  Likewise the row-major
    // batch of inputs is an input_size x batch_size column-major matrix, and
    // the output_size x batch_size column-major result is exactly the
    // row-major batch of outputs.
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                (int)output_size,   // M
                (int)batch_size,    // N
                (int)input_size,    // K
                1.0f,               // alpha
                weights,            // A
                (int)input_size,    // lda, leading rank of A
                inputs,             // B
                (int)input_size,    // ldb, leading rank of B
                0.0f,               // beta
                outputs,            // C
                (int)output_size);  // ldc, leading rank of C
  }

  // Add the bias to every row of the batch; with apply_relu, negative sums
  // are clamped to zero (the comparison form preserves IEEE edge cases).
  for (size_t i = 0; i < batch_size; i++) {
    float* row = outputs + i * output_size;
    if (apply_relu) {
      for (size_t o = 0; o < output_size; o++) {
        const float v = biases[o] + row[o];
        row[o] = v >= 0 ? v : 0;
      }
    } else {
      for (size_t o = 0; o < output_size; o++) {
        row[o] += biases[o];
      }
    }
  }
}

// Scalar "layer": the dot product of two length-`size` vectors.
float FullyConnectedLayer::Forward0D(const size_t size, const float* x,
                                     const float* y) {
  // A scalar product, also known as a dot-product.
  // float cblas_sdot(const int N, const float *X, const int incX, const float
  // *Y,
  //                  const int incY);
  return cblas_sdot((int)size, x, 1, y, 1);
}

} // namespace lczero
42 changes: 42 additions & 0 deletions lib/neural/blas/fully_connected_layer.h
@@ -0,0 +1,42 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstddef>
#include <vector>

namespace lczero {

// Fully connected (dense) layers implemented on top of BLAS.
// Stateless: weights and biases are supplied by the caller on each call.
class FullyConnectedLayer {
 public:
  // Not instantiable: only static helpers.
  FullyConnectedLayer() = delete;

  // Forward inference, batched, from input_size to output_size.
  // Computes output = weights * input + biases per batch entry, optionally
  // applying a ReLU; weights is output_size x input_size, row-major.
  static void Forward1D(const size_t batch_size, const size_t input_size,
                        const size_t output_size, const float* input,
                        const float* weights, const float* biases,
                        bool apply_relu, float* output);

  // Forward inference, not batched, from input_size to a scalar:
  // the dot product of input and weights.
  static float Forward0D(const size_t input_size, const float* input,
                         const float* weights);

};

} // namespace lczero

0 comments on commit fe6416e

Please sign in to comment.