Skip to content

Commit 38e6166

Browse files
AndrovT authored and vondele committed
Use block sparse input for the first layer.
Use block sparse input for the first fully connected layer on architectures with at least SSSE3. Depending on the CPU architecture, this yields a speedup of up to 10%, e.g. ``` Result of 100 runs of 'bench 16 1 13 default depth NNUE' base (...ockfish-base) = 959345 +/- 7477 test (...ckfish-patch) = 1054340 +/- 9640 diff = +94995 +/- 3999 speedup = +0.0990 P(speedup > 0) = 1.0000 CPU: 8 x AMD Ryzen 7 5700U with Radeon Graphics Hyperthreading: on ``` Passed STC: https://tests.stockfishchess.org/tests/view/6485aa0965ffe077ca12409c LLR: 2.93 (-2.94,2.94) <0.00,2.00> Total: 8864 W: 2479 L: 2223 D: 4162 Ptnml(0-2): 13, 829, 2504, 1061, 25 This commit includes a net with reordered weights, to increase the likelihood of block sparse inputs, but otherwise equivalent to the previous master net (nn-ea57bea57e32.nnue). Activation data collected with https://github.com/AndrovT/Stockfish/tree/log-activations, running bench 16 1 13 varied_1000.epd depth NNUE on this data. Net parameters permuted with https://gist.github.com/AndrovT/9e3fbaebb7082734dc84d27e02094cb3. closes #4612 No functional change
1 parent b7ee729 commit 38e6166

File tree

4 files changed

+290
-2
lines changed

4 files changed

+290
-2
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ Norman Schmidt (FireFather)
159159
notruck
160160
Ofek Shochat (OfekShochat, ghostway)
161161
Ondrej Mosnáček (WOnder93)
162+
Ondřej Mišina (AndrovT)
162163
Oskar Werkelin Ahlin
163164
Pablo Vazquez
164165
Panthee

src/evaluate.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ namespace Eval {
3939
// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
4040
// for the build process (profile-build and fishtest) to work. Do not change the
4141
// name of the macro, as it is used in the Makefile.
42-
#define EvalFileDefaultName "nn-ea57bea57e32.nnue"
42+
#define EvalFileDefaultName "nn-fdc1d0fe6455.nnue"
4343

4444
namespace NNUE {
4545

src/nnue/layers/affine_transform_sparse_input.h

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
/*
2+
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
3+
Copyright (C) 2004-2023 The Stockfish developers (see AUTHORS file)
4+
5+
Stockfish is free software: you can redistribute it and/or modify
6+
it under the terms of the GNU General Public License as published by
7+
the Free Software Foundation, either version 3 of the License, or
8+
(at your option) any later version.
9+
10+
Stockfish is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
GNU General Public License for more details.
14+
15+
You should have received a copy of the GNU General Public License
16+
along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
19+
// Definition of layer AffineTransformSparseInput of NNUE evaluation function
20+
21+
#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
22+
#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
23+
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <type_traits>

#include "../nnue_common.h"
#include "affine_transform.h"
#include "simd.h"
32+
/*
33+
This file contains the definition for a fully connected layer (aka affine transform) with block sparse input.
34+
*/
35+
36+
namespace Stockfish::Eval::NNUE::Layers {
37+
#if defined(__GNUC__) // GCC, Clang, ICC
38+
39+
static inline IndexType lsb_(std::uint32_t b) {
40+
assert(b);
41+
return IndexType(__builtin_ctzl(b));
42+
}
43+
44+
#elif defined(_MSC_VER) // MSVC
45+
46+
static inline IndexType lsb_(std::uint32_t b) {
47+
assert(b);
48+
unsigned long idx;
49+
_BitScanForward(&idx, b);
50+
return (IndexType) idx;
51+
}
52+
53+
#else // Compiler is neither GCC nor MSVC compatible
54+
55+
#error "Compiler not supported."
56+
57+
#endif
58+
59+
60+
#if defined(USE_SSSE3)
// lookup_indices[m] lists, in ascending order, the bit positions that are set
// in the 8-bit mask m. Unused trailing slots stay zero-initialized.
alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
  std::array<std::array<std::uint16_t, 8>, 256> table{};
  for (int mask = 0; mask < 256; ++mask)
  {
      int slot = 0;
      // Clear the lowest set bit each iteration, recording its position.
      for (int bits = mask; bits != 0; bits &= bits - 1)
          table[mask][slot++] = lsb_(std::uint32_t(bits));
  }
  return table;
}();
77+
// lookup_count[m] is the population count of the 8-bit mask m, i.e. how many
// entries of lookup_indices[m] are valid.
alignas(CacheLineSize) static inline const std::array<unsigned, 256> lookup_count = [](){
  std::array<unsigned, 256> table{};
  for (int mask = 0; mask < 256; ++mask)
  {
      unsigned bitsSet = 0;
      // Sum the eight individual bits of the mask.
      for (int bit = 0; bit < 8; ++bit)
          bitsSet += (mask >> bit) & 1;
      table[mask] = bitsSet;
  }
  return table;
}();
92+
93+
// Find indices of nonzero numbers in an int32_t array
94+
template<const IndexType InputDimensions>
95+
void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
96+
#if defined (USE_AVX512)
97+
using vec_t = __m512i;
98+
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
99+
#elif defined (USE_AVX2)
100+
using vec_t = __m256i;
101+
#define vec_nnz(a) _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))
102+
#elif defined (USE_SSSE3)
103+
using vec_t = __m128i;
104+
#define vec_nnz(a) _mm_movemask_ps((__m128)_mm_cmpgt_epi32(a, _mm_setzero_si128()))
105+
#endif
106+
constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
107+
// Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8)
108+
constexpr IndexType ChunkSize = std::max<IndexType>(InputSimdWidth, 8);
109+
constexpr IndexType NumChunks = InputDimensions / ChunkSize;
110+
constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth;
111+
constexpr IndexType OutputsPerChunk = ChunkSize / 8;
112+
113+
const auto inputVector = reinterpret_cast<const vec_t*>(input);
114+
IndexType count = 0;
115+
__m128i base = _mm_set1_epi16(0);
116+
__m128i increment = _mm_set1_epi16(8);
117+
for (IndexType i = 0; i < NumChunks; ++i)
118+
{
119+
// bitmask of nonzero values in this chunk
120+
unsigned nnz = 0;
121+
for (IndexType j = 0; j < InputsPerChunk; ++j)
122+
{
123+
const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
124+
nnz |= (unsigned)vec_nnz(inputChunk) << (j * InputSimdWidth);
125+
}
126+
for (IndexType j = 0; j < OutputsPerChunk; ++j)
127+
{
128+
const auto lookup = (nnz >> (j * 8)) & 0xFF;
129+
const auto offsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&lookup_indices[lookup]));
130+
_mm_storeu_si128(reinterpret_cast<__m128i*>(out + count), _mm_add_epi16(base, offsets));
131+
count += lookup_count[lookup];
132+
base = _mm_add_epi16(base, increment);
133+
}
134+
}
135+
count_out = count;
136+
}
137+
# undef vec_nnz
138+
#endif
139+
140+
// Sparse input implementation
141+
template <IndexType InDims, IndexType OutDims>
142+
class AffineTransformSparseInput {
143+
public:
144+
// Input/output type
145+
// Input/output type
146+
using InputType = std::uint8_t;
147+
using OutputType = std::int32_t;
148+
149+
// Number of input/output dimensions
150+
static constexpr IndexType InputDimensions = InDims;
151+
static constexpr IndexType OutputDimensions = OutDims;
152+
153+
static_assert(OutputDimensions % 16 == 0, "Only implemented for OutputDimensions divisible by 16.");
154+
155+
static constexpr IndexType PaddedInputDimensions =
156+
ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
157+
static constexpr IndexType PaddedOutputDimensions =
158+
ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
159+
160+
#if defined (USE_SSSE3)
161+
static constexpr IndexType ChunkSize = 4;
162+
#else
163+
static constexpr IndexType ChunkSize = 1;
164+
#endif
165+
166+
using OutputBuffer = OutputType[PaddedOutputDimensions];
167+
168+
// Hash value embedded in the evaluation file
169+
static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
170+
std::uint32_t hashValue = 0xCC03DAE4u;
171+
hashValue += OutputDimensions;
172+
hashValue ^= prevHash >> 1;
173+
hashValue ^= prevHash << 31;
174+
return hashValue;
175+
}
176+
177+
static IndexType get_weight_index_scrambled(IndexType i)
178+
{
179+
return
180+
(i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize +
181+
i / PaddedInputDimensions * ChunkSize +
182+
i % ChunkSize;
183+
}
184+
185+
static IndexType get_weight_index(IndexType i)
186+
{
187+
#if defined (USE_SSSE3)
188+
return get_weight_index_scrambled(i);
189+
#else
190+
return i;
191+
#endif
192+
}
193+
194+
// Read network parameters
195+
bool read_parameters(std::istream& stream) {
196+
read_little_endian<BiasType>(stream, biases, OutputDimensions);
197+
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
198+
weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
199+
200+
return !stream.fail();
201+
}
202+
203+
// Write network parameters
204+
bool write_parameters(std::ostream& stream) const {
205+
write_little_endian<BiasType>(stream, biases, OutputDimensions);
206+
207+
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
208+
write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
209+
210+
return !stream.fail();
211+
}
212+
// Forward propagation
213+
const OutputType* propagate(
214+
const InputType* input, OutputType* output) const {
215+
216+
#if defined (USE_SSSE3)
217+
#if defined (USE_AVX512)
218+
using vec_t = __m512i;
219+
#define vec_setzero _mm512_setzero_si512
220+
#define vec_set_32 _mm512_set1_epi32
221+
#define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
222+
#elif defined (USE_AVX2)
223+
using vec_t = __m256i;
224+
#define vec_setzero _mm256_setzero_si256
225+
#define vec_set_32 _mm256_set1_epi32
226+
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
227+
#elif defined (USE_SSSE3)
228+
using vec_t = __m128i;
229+
#define vec_setzero _mm_setzero_si128
230+
#define vec_set_32 _mm_set1_epi32
231+
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
232+
#endif
233+
static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
234+
235+
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
236+
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
237+
std::uint16_t nnz[NumChunks];
238+
IndexType count;
239+
240+
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
241+
242+
// Find indices of nonzero 32bit blocks
243+
find_nnz<NumChunks>(input32, nnz, count);
244+
245+
const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
246+
vec_t acc[NumRegs];
247+
for (IndexType k = 0; k < NumRegs; ++k)
248+
acc[k] = biasvec[k];
249+
250+
for (IndexType j = 0; j < count; ++j)
251+
{
252+
const auto i = nnz[j];
253+
const vec_t in = vec_set_32(input32[i]);
254+
const auto col = reinterpret_cast<const vec_t*>(&weights[i * OutputDimensions * ChunkSize]);
255+
for (IndexType k = 0; k < NumRegs; ++k)
256+
vec_add_dpbusd_32(acc[k], in, col[k]);
257+
}
258+
259+
vec_t* outptr = reinterpret_cast<vec_t*>(output);
260+
for (IndexType k = 0; k < NumRegs; ++k)
261+
outptr[k] = acc[k];
262+
# undef vec_setzero
263+
# undef vec_set_32
264+
# undef vec_add_dpbusd_32
265+
#else
266+
// Use dense implementation for the other architectures.
267+
affine_transform_non_ssse3<
268+
InputDimensions,
269+
PaddedInputDimensions,
270+
OutputDimensions>(output, weights, biases, input);
271+
#endif
272+
273+
return output;
274+
}
275+
276+
private:
277+
using BiasType = OutputType;
278+
using WeightType = std::int8_t;
279+
280+
alignas(CacheLineSize) BiasType biases[OutputDimensions];
281+
alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
282+
};
283+
284+
} // namespace Stockfish::Eval::NNUE::Layers
285+
286+
#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED

src/nnue/nnue_architecture.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "features/half_ka_v2_hm.h"
2929

30+
#include "layers/affine_transform_sparse_input.h"
3031
#include "layers/affine_transform.h"
3132
#include "layers/clipped_relu.h"
3233
#include "layers/sqr_clipped_relu.h"
@@ -48,7 +49,7 @@ struct Network
4849
static constexpr int FC_0_OUTPUTS = 15;
4950
static constexpr int FC_1_OUTPUTS = 32;
5051

51-
Layers::AffineTransform<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
52+
Layers::AffineTransformSparseInput<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
5253
Layers::SqrClippedReLU<FC_0_OUTPUTS + 1> ac_sqr_0;
5354
Layers::ClippedReLU<FC_0_OUTPUTS + 1> ac_0;
5455
Layers::AffineTransform<FC_0_OUTPUTS * 2, FC_1_OUTPUTS> fc_1;

0 commit comments

Comments
 (0)