diff --git a/src/evaluate.h b/src/evaluate.h index 57a7687d776..1934c9bddf0 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -39,7 +39,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-ac07bd334b62.nnue" + #define EvalFileDefaultName "nn-6877cd24400e.nnue" namespace NNUE { diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 862b2003388..0fd58462b78 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -148,22 +148,18 @@ namespace Stockfish::Eval::NNUE { #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType transformedFeaturesUnaligned[ FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)]; - char bufferUnaligned[Network::BufferSize + alignment]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); - auto* buffer = align_ptr_up(&bufferUnaligned[0]); #else alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; - alignas(alignment) char buffer[Network::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); - ASSERT_ALIGNED(buffer, alignment); const std::size_t bucket = (pos.count() - 1) / 4; const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures, buffer)[0]; + const auto positional = network[bucket]->propagate(transformedFeatures); // Give more value to positional evaluation when adjusted flag is set if (adjusted) @@ -190,27 +186,20 @@ namespace Stockfish::Eval::NNUE { #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType transformedFeaturesUnaligned[ FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)]; - char bufferUnaligned[Network::BufferSize + alignment]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); - auto* buffer = align_ptr_up(&bufferUnaligned[0]); #else alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; - alignas(alignment) char buffer[Network::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); - ASSERT_ALIGNED(buffer, alignment); NnueEvalTrace t{}; t.correctBucket = (pos.count() - 1) / 4; for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) { - const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto output = network[bucket]->propagate(transformedFeatures, buffer); - - int materialist = psqt; - int positional = output[0]; + const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket); + const auto positional = network[bucket]->propagate(transformedFeatures); t.psqt[bucket] = static_cast( materialist / OutputScale ); t.positional[bucket] = static_cast( positional / OutputScale ); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 4e85a5fe4b1..22451915ba1 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -63,19 +63,17 @@ namespace Stockfish::Eval::NNUE::Layers { { # if defined(USE_SSE2) // At least a multiple of 16, with SSE2. - static_assert(PaddedInputDimensions % 16 == 0); - constexpr IndexType NumChunks = PaddedInputDimensions / 16; + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const __m128i Zeros = _mm_setzero_si128(); const auto inputVector = reinterpret_cast(input); # elif defined(USE_MMX) - static_assert(InputDimensions % 8 == 0); - constexpr IndexType NumChunks = InputDimensions / 8; + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 8; const __m64 Zeros = _mm_setzero_si64(); const auto inputVector = reinterpret_cast(input); # elif defined(USE_NEON) - constexpr IndexType NumChunks = (InputDimensions + 15) / 16; + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const auto inputVector = reinterpret_cast(input); # endif @@ -150,24 +148,27 @@ namespace Stockfish::Eval::NNUE::Layers { } #endif - template + template class AffineTransform; // A specialization for large inputs. - template - class AffineTransform= 2*64-1)>> { + template + class AffineTransform(InDims, MaxSimdWidth) >= 2*64)>> { public: // Input/output type - using InputType = typename PreviousLayer::OutputType; + using InputType = std::uint8_t; using OutputType = std::int32_t; - static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions; + static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = OutDims; static constexpr IndexType PaddedInputDimensions = ceil_to_multiple(InputDimensions, MaxSimdWidth); + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, MaxSimdWidth); + + using OutputBuffer = OutputType[PaddedOutputDimensions]; static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen."); @@ -202,20 +203,12 @@ namespace Stockfish::Eval::NNUE::Layers { static_assert(OutputDimensions % NumOutputRegs == 0); - // Size of forward propagation buffer used in this layer - static constexpr std::size_t SelfBufferSize = - ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); - - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t BufferSize = - PreviousLayer::BufferSize + SelfBufferSize; - // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value() { + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { std::uint32_t hashValue = 0xCC03DAE4u; hashValue += OutputDimensions; - hashValue ^= PreviousLayer::get_hash_value() >> 1; - hashValue ^= PreviousLayer::get_hash_value() << 31; + hashValue ^= prevHash >> 1; + hashValue ^= prevHash << 31; return hashValue; } @@ -242,7 +235,6 @@ namespace Stockfish::Eval::NNUE::Layers { // Read network parameters bool read_parameters(std::istream& stream) { - if (!previousLayer.read_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) biases[i] = read_little_endian(stream); @@ -254,7 +246,6 @@ namespace Stockfish::Eval::NNUE::Layers { // Write network parameters bool write_parameters(std::ostream& stream) const { - if (!previousLayer.write_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) write_little_endian(stream, biases[i]); @@ -266,10 +257,7 @@ namespace Stockfish::Eval::NNUE::Layers { // Forward propagation const OutputType* propagate( - const TransformedFeatureType* transformedFeatures, char* buffer) const { - const auto input = previousLayer.propagate( - transformedFeatures, buffer + SelfBufferSize); - OutputType* output = reinterpret_cast(buffer); + const InputType* input, OutputType* output) const { #if defined (USE_AVX512) using acc_vec_t = __m512i; @@ -312,7 +300,6 @@ namespace Stockfish::Eval::NNUE::Layers { #if defined (USE_SSSE3) || defined (USE_NEON) const in_vec_t* invec = reinterpret_cast(input); - // Perform accumulation to registers for each big block for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock) { @@ -377,26 +364,28 @@ namespace Stockfish::Eval::NNUE::Layers { using BiasType = OutputType; using WeightType = std::int8_t; - PreviousLayer previousLayer; - alignas(CacheLineSize) BiasType biases[OutputDimensions]; alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; }; - template - class AffineTransform> { + template + class AffineTransform(InDims, MaxSimdWidth) < 2*64)>> { public: // Input/output type - using InputType = typename PreviousLayer::OutputType; + // Input/output type + using InputType = std::uint8_t; using OutputType = std::int32_t; - static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType InputDimensions = - PreviousLayer::OutputDimensions; + static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = OutDims; + static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); + ceil_to_multiple(InputDimensions, MaxSimdWidth); + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, MaxSimdWidth); + + using OutputBuffer = OutputType[PaddedOutputDimensions]; static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen."); @@ -405,20 +394,12 @@ namespace Stockfish::Eval::NNUE::Layers { static constexpr const IndexType InputSimdWidth = SimdWidth; #endif - // Size of forward propagation buffer used in this layer - static constexpr std::size_t SelfBufferSize = - ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); - - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t BufferSize = - PreviousLayer::BufferSize + SelfBufferSize; - // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value() { + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { std::uint32_t hashValue = 0xCC03DAE4u; hashValue += OutputDimensions; - hashValue ^= PreviousLayer::get_hash_value() >> 1; - hashValue ^= PreviousLayer::get_hash_value() << 31; + hashValue ^= prevHash >> 1; + hashValue ^= prevHash << 31; return hashValue; } @@ -441,7 +422,6 @@ namespace Stockfish::Eval::NNUE::Layers { // Read network parameters bool read_parameters(std::istream& stream) { - if (!previousLayer.read_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) biases[i] = read_little_endian(stream); for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) @@ -452,7 +432,6 @@ namespace Stockfish::Eval::NNUE::Layers { // Write network parameters bool write_parameters(std::ostream& stream) const { - if (!previousLayer.write_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) write_little_endian(stream, biases[i]); @@ -463,10 +442,7 @@ namespace Stockfish::Eval::NNUE::Layers { } // Forward propagation const OutputType* propagate( - const TransformedFeatureType* transformedFeatures, char* buffer) const { - const auto input = previousLayer.propagate( - transformedFeatures, buffer + SelfBufferSize); - const auto output = reinterpret_cast(buffer); + const InputType* input, OutputType* output) const { #if defined (USE_AVX2) using vec_t = __m256i; @@ -491,12 +467,11 @@ namespace Stockfish::Eval::NNUE::Layers { #if defined (USE_SSSE3) const auto inputVector = reinterpret_cast(input); - static_assert(InputDimensions % 8 == 0); static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); if constexpr (OutputDimensions % OutputSimdWidth == 0) { - constexpr IndexType NumChunks = InputDimensions / 4; + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; const auto input32 = reinterpret_cast(input); @@ -555,8 +530,6 @@ namespace Stockfish::Eval::NNUE::Layers { using BiasType = OutputType; using WeightType = std::int8_t; - PreviousLayer previousLayer; - alignas(CacheLineSize) BiasType biases[OutputDimensions]; alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; }; diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 0da5e821011..ffd2e3b76a9 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -26,51 +26,41 @@ namespace Stockfish::Eval::NNUE::Layers { // Clipped ReLU - template + template class ClippedReLU { public: // Input/output type - using InputType = typename PreviousLayer::OutputType; + using InputType = std::int32_t; using OutputType = std::uint8_t; - static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions; + static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = InputDimensions; static constexpr IndexType PaddedOutputDimensions = ceil_to_multiple(OutputDimensions, 32); - // Size of forward propagation buffer used in this layer - static constexpr std::size_t SelfBufferSize = - ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); - - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t BufferSize = - PreviousLayer::BufferSize + SelfBufferSize; + using OutputBuffer = OutputType[PaddedOutputDimensions]; // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value() { + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { std::uint32_t hashValue = 0x538D24C7u; - hashValue += PreviousLayer::get_hash_value(); + hashValue += prevHash; return hashValue; } // Read network parameters - bool read_parameters(std::istream& stream) { - return previousLayer.read_parameters(stream); + bool read_parameters(std::istream&) { + return true; } // Write network parameters - bool write_parameters(std::ostream& stream) const { - return previousLayer.write_parameters(stream); + bool write_parameters(std::ostream&) const { + return true; } // Forward propagation const OutputType* propagate( - const TransformedFeatureType* transformedFeatures, char* buffer) const { - const auto input = previousLayer.propagate( - transformedFeatures, buffer + SelfBufferSize); - const auto output = reinterpret_cast(buffer); + const InputType* input, OutputType* output) const { #if defined(USE_AVX2) if constexpr (InputDimensions % SimdWidth == 0) { @@ -191,9 +181,6 @@ namespace Stockfish::Eval::NNUE::Layers { return output; } - - private: - PreviousLayer previousLayer; }; } // namespace Stockfish::Eval::NNUE::Layers diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h deleted file mode 100644 index 8f526b745f7..00000000000 --- a/src/nnue/layers/input_slice.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -// NNUE evaluation function layer InputSlice definition - -#ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED -#define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED - -#include "../nnue_common.h" - -namespace Stockfish::Eval::NNUE::Layers { - -// Input layer -template -class InputSlice { - public: - // Need to maintain alignment - static_assert(Offset % MaxSimdWidth == 0, ""); - - // Output type - using OutputType = TransformedFeatureType; - - // Output dimensionality - static constexpr IndexType OutputDimensions = OutDims; - - // Size of forward propagation buffer used from the input layer to this layer - static constexpr std::size_t BufferSize = 0; - - // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value() { - std::uint32_t hashValue = 0xEC42E90Du; - hashValue ^= OutputDimensions ^ (Offset << 10); - return hashValue; - } - - // Read network parameters - bool read_parameters(std::istream& /*stream*/) { - return true; - } - - // Write network parameters - bool write_parameters(std::ostream& /*stream*/) const { - return true; - } - - // Forward propagation - const OutputType* propagate( - const TransformedFeatureType* transformedFeatures, - char* /*buffer*/) const { - return transformedFeatures + Offset; - } - - private: -}; - -} // namespace Stockfish::Eval::NNUE::Layers - -#endif // #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 8867fac72fc..725b40fb43d 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -25,35 +25,106 @@ #include "features/half_ka_v2_hm.h" -#include "layers/input_slice.h" #include "layers/affine_transform.h" #include "layers/clipped_relu.h" -namespace Stockfish::Eval::NNUE { - - // Input features used in evaluation function - using FeatureSet = Features::HalfKAv2_hm; - - // Number of input feature dimensions after conversion - constexpr IndexType TransformedFeatureDimensions = 1024; - constexpr IndexType PSQTBuckets = 8; - constexpr IndexType LayerStacks = 8; - - namespace Layers { +#include "../misc.h" - // Define network structure - using InputLayer = InputSlice; - using HiddenLayer1 = ClippedReLU>; - using HiddenLayer2 = ClippedReLU>; - using OutputLayer = AffineTransform; - - } // namespace Layers - - using Network = Layers::OutputLayer; +namespace Stockfish::Eval::NNUE { - static_assert(TransformedFeatureDimensions % MaxSimdWidth == 0, ""); - static_assert(Network::OutputDimensions == 1, ""); - static_assert(std::is_same::value, ""); +// Input features used in evaluation function +using FeatureSet = Features::HalfKAv2_hm; + +// Number of input feature dimensions after conversion +constexpr IndexType TransformedFeatureDimensions = 1024; +constexpr IndexType PSQTBuckets = 8; +constexpr IndexType LayerStacks = 8; + +struct Network +{ + static constexpr int FC_0_OUTPUTS = 15; + static constexpr int FC_1_OUTPUTS = 32; + + Layers::AffineTransform fc_0; + Layers::ClippedReLU ac_0; + Layers::AffineTransform fc_1; + Layers::ClippedReLU ac_1; + Layers::AffineTransform fc_2; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value() { + // input slice hash + std::uint32_t hashValue = 0xEC42E90Du; + hashValue ^= TransformedFeatureDimensions * 2; + + hashValue = decltype(fc_0)::get_hash_value(hashValue); + hashValue = decltype(ac_0)::get_hash_value(hashValue); + hashValue = decltype(fc_1)::get_hash_value(hashValue); + hashValue = decltype(ac_1)::get_hash_value(hashValue); + hashValue = decltype(fc_2)::get_hash_value(hashValue); + + return hashValue; + } + + // Read network parameters + bool read_parameters(std::istream& stream) { + if (!fc_0.read_parameters(stream)) return false; + if (!ac_0.read_parameters(stream)) return false; + if (!fc_1.read_parameters(stream)) return false; + if (!ac_1.read_parameters(stream)) return false; + if (!fc_2.read_parameters(stream)) return false; + return true; + } + + // Read network parameters + bool write_parameters(std::ostream& stream) const { + if (!fc_0.write_parameters(stream)) return false; + if (!ac_0.write_parameters(stream)) return false; + if (!fc_1.write_parameters(stream)) return false; + if (!ac_1.write_parameters(stream)) return false; + if (!fc_2.write_parameters(stream)) return false; + return true; + } + + std::int32_t propagate(const TransformedFeatureType* transformedFeatures) + { + constexpr uint64_t alignment = CacheLineSize; + + struct Buffer + { + alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out; + alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out; + alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out; + alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out; + alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out; + }; + +#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) + char bufferRaw[sizeof(Buffer) + alignment]; + char* bufferRawAligned = align_ptr_up(&bufferRaw[0]); + Buffer& buffer = *(new (bufferRawAligned) Buffer); +#else + alignas(alignment) Buffer buffer; +#endif + + fc_0.propagate(transformedFeatures, buffer.fc_0_out); + ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out); + fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out); + ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out); + fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out); + + // buffer.fc_0_out[FC_0_OUTPUTS] is such that 1.0 is equal to 127*(1<(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - __m512i sum0 = _mm512_load_si512(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 0]); - __m512i sum1 = _mm512_load_si512(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 1]); + const IndexType offset = (HalfDimensions / 2) * p; - _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, - _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); - } - } - return psqt; +#if defined(USE_AVX512) - #elif defined(USE_AVX2) + constexpr IndexType OutputChunkSize = 512 / 8; + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; - constexpr int Control = 0b11011000; - const __m256i Zero = _mm256_setzero_si256(); + const __m512i Zero = _mm512_setzero_si512(); + const __m512i One = _mm512_set1_epi16(127); + const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = HalfDimensions * p; - auto out = reinterpret_cast<__m256i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) + const __m512i* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const __m512i* in1 = reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + __m512i* out = reinterpret_cast< __m512i*>(output + offset); + + for (IndexType j = 0; j < NumOutputChunks; j += 1) { - __m256i sum0 = _mm256_load_si256(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 0]); - __m256i sum1 = _mm256_load_si256(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 1]); + const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero); + const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero); + const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero); + const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64( - _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control)); + const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7); + const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7); + + out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb)); } - } - return psqt; - #elif defined(USE_SSE2) +#elif defined(USE_AVX2) - #ifdef USE_SSE41 - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; - const __m128i Zero = _mm_setzero_si128(); - #else - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; - const __m128i k0x80s = _mm_set1_epi8(-128); - #endif + constexpr IndexType OutputChunkSize = 256 / 8; + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = HalfDimensions * p; - auto out = reinterpret_cast<__m128i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) + const __m256i Zero = _mm256_setzero_si256(); + const __m256i One = _mm256_set1_epi16(127); + constexpr int Control = 0b11011000; + + const __m256i* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const __m256i* in1 = reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + __m256i* out = reinterpret_cast< __m256i*>(output + offset); + + for (IndexType j = 0; j < NumOutputChunks; j += 1) { - __m128i sum0 = _mm_load_si128(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 0]); - __m128i sum1 = _mm_load_si128(&reinterpret_cast - (accumulation[perspectives[p]])[j * 2 + 1]); - const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); - - #ifdef USE_SSE41 - _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero)); - #else - _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); - #endif + const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero); + const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero); + const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero); + const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero); + + const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7); + const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7); + + out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control); } - } - return psqt; - #elif defined(USE_MMX) +#elif defined(USE_SSE2) - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; - const __m64 k0x80s = _mm_set1_pi8(-128); + constexpr IndexType OutputChunkSize = 128 / 8; + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = HalfDimensions * p; - auto out = reinterpret_cast<__m64*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) + const __m128i Zero = _mm_setzero_si128(); + const __m128i One = _mm_set1_epi16(127); + + const __m128i* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const __m128i* in1 = reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + __m128i* out = reinterpret_cast< __m128i*>(output + offset); + + for (IndexType j = 0; j < NumOutputChunks; j += 1) { - __m64 sum0 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); - __m64 sum1 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); - const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); - out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero); + const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero); + const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero); + const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero); + + const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7); + const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7); + + out[j] = _mm_packs_epi16(pa, pb); } - } - _mm_empty(); - return psqt; - #elif defined(USE_NEON) +#elif defined(USE_NEON) - constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2); - const int8x8_t Zero = {0}; + constexpr IndexType OutputChunkSize = 128 / 8; + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = HalfDimensions * p; - const auto out = reinterpret_cast(&output[offset]); + const int16x8_t Zero = vdupq_n_s16(0); + const int16x8_t One = vdupq_n_s16(127); - constexpr IndexType UnrollFactor = 16; - static_assert(UnrollFactor % UnrollFactor == 0); - for (IndexType j = 0; j < NumChunks; j += UnrollFactor) + const int16x8_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const int16x8_t* in1 = reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + int8x16_t* out = reinterpret_cast< int8x16_t*>(output + offset); + + for (IndexType j = 0; j < NumOutputChunks; j += 1) { - int16x8_t sums[UnrollFactor]; - for (IndexType i = 0; i < UnrollFactor; ++i) - sums[i] = reinterpret_cast(accumulation[perspectives[p]])[j+i]; + const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero); + const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero); + const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero); + const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero); + + const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7); + const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7); - for (IndexType i = 0; i < UnrollFactor; ++i) - out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero); + out[j] = vcombine_s8(pa, pb); } - } - return psqt; - #else +#else - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = HalfDimensions * p; - for (IndexType j = 0; j < HalfDimensions; ++j) - { - BiasType sum = accumulation[perspectives[p]][j]; - output[offset + j] = static_cast(std::max(0, std::min(127, sum))); + for (IndexType j = 0; j < HalfDimensions / 2; ++j) { + BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; + BiasType sum1 = accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; + sum0 = std::max(0, std::min(127, sum0)); + sum1 = std::max(0, std::min(127, sum1)); + output[offset + j] = static_cast(sum0 * sum1 / 128); } + +#endif } - return psqt; - #endif + return psqt; } // end of function transform()