diff --git a/src/evaluate.h b/src/evaluate.h index 4b3e91acf4c..cc26c945980 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -33,8 +33,8 @@ namespace Eval { // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro or the location where this macro is defined, as it is used // in the Makefile/Fishtest. -#define EvalFileDefaultNameBig "nn-c721dfca8cd3.nnue" -#define EvalFileDefaultNameSmall "nn-baff1ede1f90.nnue" +#define EvalFileDefaultNameBig "nn-ea6ac98d29d4.nnue" +#define EvalFileDefaultNameSmall "nn-321bbccb5bcf.nnue" namespace NNUE { struct Networks; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 7b7aada312c..dd179a44b30 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -55,14 +55,14 @@ using psqt_vec_t = __m256i; #define vec_store(a, b) _mm512_store_si512(a, b) #define vec_add_16(a, b) _mm512_add_epi16(a, b) #define vec_sub_16(a, b) _mm512_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm512_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b) #define vec_zero() _mm512_setzero_epi32() #define vec_set_16(a) _mm512_set1_epi16(a) #define vec_max_16(a, b) _mm512_max_epi16(a, b) #define vec_min_16(a, b) _mm512_min_epi16(a, b) + #define vec_slli_16(a, b) _mm512_slli_epi16(a, b) // Inverse permuted at load time - #define vec_msb_pack_16(a, b) \ - _mm512_packs_epi16(_mm512_srli_epi16(a, 7), _mm512_srli_epi16(b, 7)) + #define vec_packus_16(a, b) _mm512_packus_epi16(a, b) #define vec_load_psqt(a) _mm256_load_si256(a) #define vec_store_psqt(a, b) _mm256_store_si256(a, b) #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) @@ -78,14 +78,14 @@ using psqt_vec_t = __m256i; #define vec_store(a, b) _mm256_store_si256(a, b) #define vec_add_16(a, b) _mm256_add_epi16(a, b) #define vec_sub_16(a, b) _mm256_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm256_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b) #define vec_zero() _mm256_setzero_si256() #define vec_set_16(a) _mm256_set1_epi16(a) #define vec_max_16(a, b) _mm256_max_epi16(a, b) #define vec_min_16(a, b) _mm256_min_epi16(a, b) + #define vec_slli_16(a, b) _mm256_slli_epi16(a, b) // Inverse permuted at load time - #define vec_msb_pack_16(a, b) \ - _mm256_packs_epi16(_mm256_srli_epi16(a, 7), _mm256_srli_epi16(b, 7)) + #define vec_packus_16(a, b) _mm256_packus_epi16(a, b) #define vec_load_psqt(a) _mm256_load_si256(a) #define vec_store_psqt(a, b) _mm256_store_si256(a, b) #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) @@ -101,12 +101,13 @@ using psqt_vec_t = __m128i; #define vec_store(a, b) *(a) = (b) #define vec_add_16(a, b) _mm_add_epi16(a, b) #define vec_sub_16(a, b) _mm_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b) #define vec_zero() _mm_setzero_si128() #define vec_set_16(a) _mm_set1_epi16(a) #define vec_max_16(a, b) _mm_max_epi16(a, b) #define vec_min_16(a, b) _mm_min_epi16(a, b) - #define vec_msb_pack_16(a, b) _mm_packs_epi16(_mm_srli_epi16(a, 7), _mm_srli_epi16(b, 7)) + #define vec_slli_16(a, b) _mm_slli_epi16(a, b) + #define vec_packus_16(a, b) _mm_packus_epi16(a, b) #define vec_load_psqt(a) (*(a)) #define vec_store_psqt(a, b) *(a) = (b) #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b) @@ -122,18 +123,14 @@ using psqt_vec_t = int32x4_t; #define vec_store(a, b) *(a) = (b) #define vec_add_16(a, b) vaddq_s16(a, b) #define vec_sub_16(a, b) vsubq_s16(a, b) - #define vec_mul_16(a, b) vmulq_s16(a, b) + #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b) #define vec_zero() \ vec_t { 0 } #define vec_set_16(a) vdupq_n_s16(a) #define vec_max_16(a, b) vmaxq_s16(a, b) #define vec_min_16(a, b) vminq_s16(a, b) -inline vec_t vec_msb_pack_16(vec_t a, vec_t b) { - const int8x8_t shifta = vshrn_n_s16(a, 7); - const int8x8_t shiftb = vshrn_n_s16(b, 7); - const int8x16_t compacted = vcombine_s8(shifta, shiftb); - return *reinterpret_cast(&compacted); -} + #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b)) + #define vec_packus_16(a, b) reinterpret_cast(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) #define vec_load_psqt(a) (*(a)) #define vec_store_psqt(a, b) *(a) = (b) #define vec_add_psqt_32(a, b) vaddq_s32(a, b) @@ -332,7 +329,7 @@ class FeatureTransformer { constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; const vec_t Zero = vec_zero(); - const vec_t One = vec_set_16(127); + const vec_t One = vec_set_16(254); const vec_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); const vec_t* in1 = @@ -341,15 +338,30 @@ class FeatureTransformer { for (IndexType j = 0; j < NumOutputChunks; ++j) { - const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero); - const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero); - const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero); - const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero); + // What we want to do is multiply inputs in a pairwise manner (after clipping), and then shift right by 9. + // Instead, we shift left by 7, and use mulhi, stripping the bottom 16 bits, effectively shifting right by 16, + // resulting in a net shift of 9 bits. We use mulhi because it maintains the sign of the multiplication (unlike mullo), + // allowing us to make use of packus to clip 2 of the inputs, resulting in a save of 2 "vec_max_16" calls. + // A special case is when we use NEON, where we shift left by 6 instead, because the instruction "vqdmulhq_s16" + // also doubles the return value after the multiplication, adding an extra shift to the left by 1, so we + // compensate by shifting less before the multiplication. + + #if defined(USE_SSE2) + constexpr int shift = 7; + #else + constexpr int shift = 6; + #endif + const vec_t sum0a = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift); + const vec_t sum0b = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift); + const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One); + const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One); - const vec_t pa = vec_mul_16(sum0a, sum1a); - const vec_t pb = vec_mul_16(sum0b, sum1b); + const vec_t pa = vec_mulhi_16(sum0a, sum1a); + const vec_t pb = vec_mulhi_16(sum0b, sum1b); - out[j] = vec_msb_pack_16(pa, pb); + out[j] = vec_packus_16(pa, pb); } #else @@ -359,9 +371,9 @@ class FeatureTransformer { BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; BiasType sum1 = accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; - sum0 = std::clamp(sum0, 0, 127); - sum1 = std::clamp(sum1, 0, 127); - output[offset + j] = static_cast(unsigned(sum0 * sum1) / 128); + sum0 = std::clamp(sum0, 0, 254); + sum1 = std::clamp(sum1, 0, 254); + output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); } #endif diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index a13c717c3d8..b54bbaba3dd 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -178,14 +178,11 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat ss << "| " << bucket << " "; ss << " | "; format_cp_aligned_dot(t.psqt[bucket], ss, pos); - ss << " " - << " | "; + ss << " " << " | "; format_cp_aligned_dot(t.positional[bucket], ss, pos); - ss << " " - << " | "; + ss << " " << " | "; format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos); - ss << " " - << " |"; + ss << " " << " |"; if (bucket == t.correctBucket) ss << " <-- this bucket is used"; ss << '\n'; diff --git a/src/tune.cpp b/src/tune.cpp index 3e5ebe5e6c3..84f59524fb4 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -59,8 +59,7 @@ void make_option(OptionsMap* options, const string& n, int v, const SetRange& r) // Print formatted parameters, ready to be copy-pasted in Fishtest std::cout << n << "," << v << "," << r(v).first << "," << r(v).second << "," - << (r(v).second - r(v).first) / 20.0 << "," - << "0.0020" << std::endl; + << (r(v).second - r(v).first) / 20.0 << "," << "0.0020" << std::endl; } } @@ -118,7 +117,6 @@ void Tune::Entry::read_option() { namespace Stockfish { -void Tune::read_results() { /* ...insert your values here... */ -} +void Tune::read_results() { /* ...insert your values here... */ } } // namespace Stockfish diff --git a/src/uci.cpp b/src/uci.cpp index cb686a027db..cb9d7b08556 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -286,9 +286,9 @@ void UCIEngine::bench(std::istream& args) { dbg_print(); - std::cerr << "\n===========================" - << "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes - << "\nNodes/second : " << 1000 * nodes / elapsed << std::endl; + std::cerr << "\n===========================" << "\nTotal time (ms) : " << elapsed + << "\nNodes searched : " << nodes << "\nNodes/second : " << 1000 * nodes / elapsed + << std::endl; // reset callback, to not capture a dangling reference to nodesSearched engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); });