/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2023 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

// Definition of layer AffineTransformSparseInput of NNUE evaluation function

#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED

#include <iostream>
#include <algorithm>
#include <array>
#include <type_traits>
#include "../nnue_common.h"
#include "affine_transform.h"
#include "simd.h"

/*
  This file contains the definition for a fully connected layer (aka affine transform)
  with block sparse input.
*/
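
/*
  The idea, in brief: the input to this layer is the clipped activation of the
  previous layer, which in practice is mostly zeros. Rather than multiplying
  the full weight matrix by the input, we first gather the indices of the
  nonzero 4-byte input blocks (find_nnz below) and then accumulate only the
  weight columns that correspond to those blocks.
*/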

namespace Stockfish::Eval::NNUE::Layers {
#if defined(__GNUC__)  // GCC, Clang, ICC

  // lsb_(b) returns the index of the least significant set bit of a nonzero word
  static inline IndexType lsb_(std::uint32_t b) {
    assert(b);
    return IndexType(__builtin_ctzl(b));
  }

#elif defined(_MSC_VER)  // MSVC

  // lsb_(b) returns the index of the least significant set bit of a nonzero word
  static inline IndexType lsb_(std::uint32_t b) {
    assert(b);
    unsigned long idx;
    _BitScanForward(&idx, b);
    return (IndexType) idx;
  }

#else  // Compiler is neither GCC nor MSVC compatible

#error "Compiler not supported."

#endif


#if defined(USE_SSSE3)
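  // lookup_indices[b][k] gives the position of the k-th set bit of the byte b;
  // it turns an 8-bit nonzero mask into up to eight 16-bit block indices at once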
  alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
    std::array<std::array<std::uint16_t, 8>, 256> v{};
    for (int i = 0; i < 256; ++i)
    {
      int j = i;
      int k = 0;
      while (j)
      {
        const IndexType lsbIndex = lsb_(std::uint32_t(j));
        j &= j - 1;
        v[i][k] = lsbIndex;
        ++k;
      }
    }
    return v;
  }();
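  // lookup_count[b] is the number of set bits in the byte b (a small popcount table)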
  alignas(CacheLineSize) static inline const std::array<unsigned, 256> lookup_count = [](){
    std::array<unsigned, 256> v;
    for (int i = 0; i < 256; ++i)
    {
      int j = i;
      int k = 0;
      while (j)
      {
        j &= j - 1;
        ++k;
      }
      v[i] = k;
    }
    return v;
  }();

  // Find indices of nonzero numbers in an int32_t array
  template<const IndexType InputDimensions>
  void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
#if defined (USE_AVX512)
    using vec_t = __m512i;
    #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined (USE_AVX2)
    using vec_t = __m256i;
    #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
#elif defined (USE_SSSE3)
    using vec_t = __m128i;
    #define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
#endif
    constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
    // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time,
    // so we process in chunks of max(InputSimdWidth, 8)
    constexpr IndexType ChunkSize = std::max<IndexType>(InputSimdWidth, 8);
    constexpr IndexType NumChunks = InputDimensions / ChunkSize;
    constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth;
    constexpr IndexType OutputsPerChunk = ChunkSize / 8;

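    // base holds the absolute index of the first block covered by the current
    // mask byte; the per-byte offsets from lookup_indices are added to it, and
    // it advances by 8 (increment) after each mask byte is consumed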
    const auto inputVector = reinterpret_cast<const vec_t*>(input);
    IndexType count = 0;
    __m128i base = _mm_set1_epi16(0);
    __m128i increment = _mm_set1_epi16(8);
    for (IndexType i = 0; i < NumChunks; ++i)
    {
      // bitmask of nonzero values in this chunk
      unsigned nnz = 0;
      for (IndexType j = 0; j < InputsPerChunk; ++j)
      {
        const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
        nnz |= (unsigned)vec_nnz(inputChunk) << (j * InputSimdWidth);
      }
      for (IndexType j = 0; j < OutputsPerChunk; ++j)
      {
        const auto lookup = (nnz >> (j * 8)) & 0xFF;
        const auto offsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&lookup_indices[lookup]));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + count), _mm_add_epi16(base, offsets));
        count += lookup_count[lookup];
        base = _mm_add_epi16(base, increment);
      }
    }
    count_out = count;
  }
# undef vec_nnz
#endif

  // Sparse input implementation
  template <IndexType InDims, IndexType OutDims>
  class AffineTransformSparseInput {
   public:
    // Input/output type
    using InputType = std::uint8_t;
    using OutputType = std::int32_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions = InDims;
    static constexpr IndexType OutputDimensions = OutDims;

    static_assert(OutputDimensions % 16 == 0, "Only implemented for OutputDimensions divisible by 16.");

    static constexpr IndexType PaddedInputDimensions =
      ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
    static constexpr IndexType PaddedOutputDimensions =
      ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);

#if defined (USE_SSSE3)
    static constexpr IndexType ChunkSize = 4;
#else
    static constexpr IndexType ChunkSize = 1;
#endif

    using OutputBuffer = OutputType[PaddedOutputDimensions];

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
      std::uint32_t hashValue = 0xCC03DAE4u;
      hashValue += OutputDimensions;
      hashValue ^= prevHash >> 1;
      hashValue ^= prevHash << 31;
      return hashValue;
    }

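    // With SSSE3 the weights are stored scrambled: the ChunkSize weights that one
    // 4-byte input block contributes to a given output are kept together, and the
    // blocks for all outputs of one input chunk form one contiguous column, so
    // propagate() can stream a whole column per nonzero input block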
    static IndexType get_weight_index_scrambled(IndexType i)
    {
      return
        (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize +
        i / PaddedInputDimensions * ChunkSize +
        i % ChunkSize;
    }

    static IndexType get_weight_index(IndexType i)
    {
#if defined (USE_SSSE3)
      return get_weight_index_scrambled(i);
#else
      return i;
#endif
    }

    // Read network parameters
    bool read_parameters(std::istream& stream) {
      read_little_endian<BiasType>(stream, biases, OutputDimensions);
      for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);

      return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) const {
      write_little_endian<BiasType>(stream, biases, OutputDimensions);

      for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);

      return !stream.fail();
    }

    // Forward propagation
    const OutputType* propagate(
        const InputType* input, OutputType* output) const {

#if defined (USE_SSSE3)
#if defined (USE_AVX512)
      using vec_t = __m512i;
      #define vec_setzero _mm512_setzero_si512
      #define vec_set_32 _mm512_set1_epi32
      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
#elif defined (USE_AVX2)
      using vec_t = __m256i;
      #define vec_setzero _mm256_setzero_si256
      #define vec_set_32 _mm256_set1_epi32
      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#elif defined (USE_SSSE3)
      using vec_t = __m128i;
      #define vec_setzero _mm_setzero_si128
      #define vec_set_32 _mm_set1_epi32
      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
#endif
      static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);

      constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
      constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
      std::uint16_t nnz[NumChunks];
      IndexType count;

      const auto input32 = reinterpret_cast<const std::int32_t*>(input);

      // Find indices of nonzero 32bit blocks
      find_nnz<NumChunks>(input32, nnz, count);

      const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
      vec_t acc[NumRegs];
      for (IndexType k = 0; k < NumRegs; ++k)
        acc[k] = biasvec[k];

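      // For each nonzero 4-byte input block: broadcast it and accumulate the
      // matching contiguous column of scrambled weights into every output
      // register (vec_add_dpbusd_32 multiplies u8*i8 pairs, summing into i32 lanes)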
      for (IndexType j = 0; j < count; ++j)
      {
        const auto i = nnz[j];
        const vec_t in = vec_set_32(input32[i]);
        const auto col = reinterpret_cast<const vec_t*>(&weights[i * OutputDimensions * ChunkSize]);
        for (IndexType k = 0; k < NumRegs; ++k)
          vec_add_dpbusd_32(acc[k], in, col[k]);
      }

      vec_t* outptr = reinterpret_cast<vec_t*>(output);
      for (IndexType k = 0; k < NumRegs; ++k)
        outptr[k] = acc[k];
# undef vec_setzero
# undef vec_set_32
# undef vec_add_dpbusd_32
#else
      // Use the dense implementation for the other architectures.
      affine_transform_non_ssse3<
        InputDimensions,
        PaddedInputDimensions,
        OutputDimensions>(output, weights, biases, input);
#endif

      return output;
    }

   private:
    using BiasType = OutputType;
    using WeightType = std::int8_t;

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
  };
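
  // Illustrative use only (hypothetical sizes; the real dimensions are fixed by
  // the network architecture headers):
  //
  //   using Hidden = AffineTransformSparseInput<1024, 16>;
  //   Hidden layer;
  //   layer.read_parameters(stream);              // load biases/weights from a net file
  //   const auto* out = layer.propagate(in, buf); // in: u8[1024], buf: i32 output buffer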

}  // namespace Stockfish::Eval::NNUE::Layers

#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED