Skip to content

Commit

Permalink
Add CPU only option for Allie by reusing the BLAS backend from Leela.
Browse files Browse the repository at this point in the history
  • Loading branch information
manyoso committed Jun 7, 2019
1 parent 1be89bb commit fe6416e
Show file tree
Hide file tree
Showing 26 changed files with 1,778 additions and 7 deletions.
2 changes: 2 additions & 0 deletions lib/blas.pri
@@ -0,0 +1,2 @@
# Enable the OpenBLAS code path (lib/neural/blas/blas.h keys off USE_OPENBLAS
# to include <cblas.h> and declare the OpenBLAS-specific routines) and link
# against the OpenBLAS library.
DEFINES += USE_OPENBLAS
LIBS += -lopenblas
21 changes: 18 additions & 3 deletions lib/lib.pro
Expand Up @@ -26,6 +26,7 @@ include(zlib.pri)
PROTOS += $$PWD/proto/net.proto
include(protobuf.pri)
include(cuda.pri)
include(blas.pri)

CONFIG(release, debug|release) {
CONFIG += optimize_full
Expand Down Expand Up @@ -59,11 +60,18 @@ HEADERS += \
$$PWD/neural/network.h \
$$PWD/neural/network_legacy.h \
$$PWD/neural/nn_policy.h \
$$PWD/neural/policy_map.h \
$$PWD/neural/weights_adapter.h \
$$PWD/neural/blas/blas.h \
$$PWD/neural/blas/convolution1.h \
$$PWD/neural/blas/fully_connected_layer.h \
$$PWD/neural/blas/se_unit.h \
$$PWD/neural/blas/winograd_convolution3.h \
$$PWD/neural/cuda/cuda_common.h \
$$PWD/neural/cuda/kernels.h \
$$PWD/neural/cuda/layers.h \
$$PWD/neural/shared/activation.h \
$$PWD/neural/shared/policy_map.h \
$$PWD/neural/shared/winograd_filter.h \
$$PWD/fathom/tbconfig.h \
$$PWD/fathom/tbcore.h \
$$PWD/fathom/tbprobe.h
Expand All @@ -87,10 +95,17 @@ SOURCES += \
$$PWD/tb.cpp \
$$PWD/uciengine.cpp \
$$PWD/zobrist.cpp \
$$PWD/neural/cuda/layers.cpp \
$$PWD/neural/cuda/nn_cuda.cpp \
$$PWD/neural/network_legacy.cpp \
$$PWD/neural/loader.cpp \
$$PWD/neural/nn_policy.cpp \
$$PWD/neural/weights_adapter.cpp \
$$PWD/neural/blas/convolution1.cpp \
$$PWD/neural/blas/fully_connected_layer.cpp \
$$PWD/neural/blas/nn_blas.cpp \
$$PWD/neural/blas/se_unit.cpp \
$$PWD/neural/blas/winograd_convolution3.cpp \
$$PWD/neural/cuda/layers.cpp \
$$PWD/neural/cuda/nn_cuda.cpp \
$$PWD/neural/shared/activation.cpp \
$$PWD/neural/shared/winograd_filter.cpp \
$$PWD/fathom/tbprobe.c
21 changes: 21 additions & 0 deletions lib/neural/blas/README.md
@@ -0,0 +1,21 @@
The files in this directory comprise the BLAS backend of Lc0.

## License

Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.

**The source files of this directory are not covered by any additional
permission.**


47 changes: 47 additions & 0 deletions lib/neural/blas/blas.h
@@ -0,0 +1,47 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

// Select the BLAS vendor based on defines
//
// Priority: USE_MKL first, then USE_OPENBLAS, then (on macOS only) the
// system Accelerate framework.  If none of these applies, no CBLAS header
// is included and translation units calling cblas_* will fail to compile.

#ifdef USE_MKL
#include <mkl.h>
#else

#ifdef USE_OPENBLAS
#include <cblas.h>

// Specific openblas routines.
// These extensions are not part of the standard CBLAS interface, so they
// are declared manually here; they expose OpenBLAS threading/build info.
extern "C" {
int openblas_get_num_procs(void);
void openblas_set_num_threads(int num_threads);
char* openblas_get_corename(void);
char* openblas_get_config(void);
}

#else

#ifdef __APPLE__
// macOS fallback: use the BLAS implementation bundled in Accelerate.
#include <Accelerate/Accelerate.h>
#define USE_ACCELERATE
#endif

#endif // USE_OPENBLAS

#endif // USE_MKL
65 changes: 65 additions & 0 deletions lib/neural/blas/convolution1.cpp
@@ -0,0 +1,65 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/blas/convolution1.h"
#include "neural/blas/blas.h"

namespace lczero {

void Convolution1::Forward(const size_t batch_size, const size_t input_channels,
                           const size_t output_channels, const float* input,
                           const float* weights, float* output) {
  // A 1x1 convolution over the 8x8 board is just a matrix product applied
  // independently to every batch entry:
  //
  //   output[o][sq] = sum_c weights[o][c] * input[c][sq]
  //
  // i.e. C = W * X with (all row-major):
  //   W: output_channels x input_channels
  //   X: input_channels  x kSquares
  //   C: output_channels x kSquares
  const size_t input_stride = kSquares * input_channels;
  const size_t output_stride = kSquares * output_channels;

  for (size_t batch = 0; batch < batch_size; ++batch) {
    const float* batch_input = input + batch * input_stride;
    float* batch_output = output + batch * output_stride;

    // C <- 1.0 * W * X + 0.0 * C  (single-precision GEMM)
    cblas_sgemm(CblasRowMajor,         // Row major format
                CblasNoTrans,          // W used as stored
                CblasNoTrans,          // X used as stored
                (int)output_channels,  // M: rows of W and C
                kSquares,              // N: cols of X and C
                (int)input_channels,   // K: cols of W / rows of X
                1.0f,                  // alpha
                weights,               // A = W
                (int)input_channels,   // lda, leading dimension of W
                batch_input,           // B = X
                kSquares,              // ldb, leading dimension of X
                0.0f,                  // beta
                batch_output,          // C
                kSquares);             // ldc, leading dimension of C
  }
}

} // namespace lczero
41 changes: 41 additions & 0 deletions lib/neural/blas/convolution1.h
@@ -0,0 +1,41 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstddef>
#include <vector>

namespace lczero {

// Convolution 1x1
//
// Stateless helper implementing a batched 1x1 convolution over 8x8 boards as
// one SGEMM per batch entry (see convolution1.cpp).
class Convolution1 {
 public:
  // Not instantiable: only static helpers.
  Convolution1() = delete;

  // Batched forward inference.
  // batch_size:      number of positions in the batch.
  // input_channels:  channels per input position.
  // output_channels: channels per output position.
  // input:           batch_size * input_channels * 64 floats.
  // weights:         output_channels * input_channels floats (1x1 kernels).
  // output:          batch_size * output_channels * 64 floats, overwritten.
  static void Forward(const size_t batch_size, const size_t input_channels,
                      const size_t output_channels, const float* input,
                      const float* weights, float* output);

 private:
  // Board geometry: a chess board is 8x8 = 64 squares.
  static constexpr auto kWidth = 8;
  static constexpr auto kHeight = 8;
  static constexpr auto kSquares = kWidth * kHeight;
};
} // namespace lczero
110 changes: 110 additions & 0 deletions lib/neural/blas/fully_connected_layer.cpp
@@ -0,0 +1,110 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/blas/fully_connected_layer.h"
#include "neural/blas/blas.h"

#include <algorithm>
#include <cassert>
#include <cmath>

namespace lczero {

void FullyConnectedLayer::Forward1D(size_t batch_size, const size_t input_size,
                                    const size_t output_size,
                                    const float* inputs, const float* weights,
                                    const float* biases, bool apply_relu,
                                    float* outputs) {
  // outputs := weights * inputs, then biased and optionally ReLU-clamped.
  // weights is an output_size x input_size row-major matrix.
  if (batch_size == 1) {
    // Single position: a plain matrix-vector product.
    //
    //   y <- 1.0 * W * x + 0.0 * y
    //
    // M = output_size rows, N = input_size cols, lda = input_size.
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                (int)output_size, (int)input_size, 1.0f, weights,
                (int)input_size, inputs, 1, 0.0f, outputs, 1);
  } else {
    // Batched: a matrix-matrix product, phrased column-major.
    //
    // The row-major weights array read column-major with leading dimension
    // input_size is W^T, so CblasTrans recovers W.  Likewise the row-major
    // batch of inputs is an input_size x batch_size column-major matrix, and
    // the output_size x batch_size column-major result is exactly the
    // row-major batch of outputs.
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                (int)output_size,   // M
                (int)batch_size,    // N
                (int)input_size,    // K
                1.0f,               // alpha
                weights,            // A
                (int)input_size,    // lda, leading rank of A
                inputs,             // B
                (int)input_size,    // ldb, leading rank of B
                0.0f,               // beta
                outputs,            // C
                (int)output_size);  // ldc, leading rank of C
  }

  // Add the bias to every row of the batch; with apply_relu, negative sums
  // are clamped to zero (the comparison form preserves IEEE edge cases).
  for (size_t i = 0; i < batch_size; i++) {
    float* row = outputs + i * output_size;
    if (apply_relu) {
      for (size_t o = 0; o < output_size; o++) {
        const float v = biases[o] + row[o];
        row[o] = v >= 0 ? v : 0;
      }
    } else {
      for (size_t o = 0; o < output_size; o++) {
        row[o] += biases[o];
      }
    }
  }
}

// Scalar "layer": the dot product of two length-`size` vectors.
float FullyConnectedLayer::Forward0D(const size_t size, const float* x,
                                     const float* y) {
  // A scalar product, also known as a dot-product.
  // float cblas_sdot(const int N, const float *X, const int incX, const float
  // *Y,
  //                  const int incY);
  return cblas_sdot((int)size, x, 1, y, 1);
}

} // namespace lczero
42 changes: 42 additions & 0 deletions lib/neural/blas/fully_connected_layer.h
@@ -0,0 +1,42 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstddef>
#include <vector>

namespace lczero {

// Fully connected (dense) layers implemented on top of BLAS.
// Stateless: weights and biases are supplied by the caller on each call.
class FullyConnectedLayer {
 public:
  // Not instantiable: only static helpers.
  FullyConnectedLayer() = delete;

  // Forward inference, batched, from input_size to output_size.
  // Computes output = weights * input + biases per batch entry, optionally
  // applying a ReLU; weights is output_size x input_size, row-major.
  static void Forward1D(const size_t batch_size, const size_t input_size,
                        const size_t output_size, const float* input,
                        const float* weights, const float* biases,
                        bool apply_relu, float* output);

  // Forward inference, not batched, from input_size to a scalar:
  // the dot product of input and weights.
  static float Forward0D(const size_t input_size, const float* input,
                         const float* weights);

};

} // namespace lczero

0 comments on commit fe6416e

Please sign in to comment.