From 5001d49f42033bcf36a6c57401f891791031b4d8 Mon Sep 17 00:00:00 2001 From: mstembera Date: Thu, 21 Mar 2024 01:25:59 -0700 Subject: [PATCH] Update nnue_feature_transformer.h Unroll update_accumulator_refresh to process two active indices simultaneously. The compiler might not unroll effectively because the number of active indices isn't known at compile time. STC https://tests.stockfishchess.org/tests/view/65faa8850ec64f0526c4fca9 LLR: 2.93 (-2.94,2.94) <0.00,2.00> Total: 130464 W: 33882 L: 33431 D: 63151 Ptnml(0-2): 539, 14591, 34501, 15082, 519 closes https://github.com/official-stockfish/Stockfish/pull/5125 No functional change --- src/nnue/nnue_feature_transformer.h | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index b42f160475f..888edebbdb3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -619,8 +619,22 @@ class FeatureTransformer { for (IndexType k = 0; k < NumRegs; ++k) acc[k] = biasesTile[k]; - for (const auto index : active) + int i = 0; + for (; i < int(active.size()) - 1; i += 2) { + IndexType index0 = active[i]; + IndexType index1 = active[i + 1]; + const IndexType offset0 = HalfDimensions * index0 + j * TileHeight; + const IndexType offset1 = HalfDimensions * index1 + j * TileHeight; + auto column0 = reinterpret_cast<const vec_t*>(&weights[offset0]); + auto column1 = reinterpret_cast<const vec_t*>(&weights[offset1]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k])); + } + for (; i < int(active.size()); ++i) + { + IndexType index = active[i]; const IndexType offset = HalfDimensions * index + j * TileHeight; auto column = reinterpret_cast<const vec_t*>(&weights[offset]); @@ -639,8 +653,23 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_zero_psqt(); - for (const auto index : active) + int i = 0; + for (; i < int(active.size())
- 1; i += 2) + { + IndexType index0 = active[i]; + IndexType index1 = active[i + 1]; + const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight; + const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight; + auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]); + auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]); + + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = + vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k])); + } + for (; i < int(active.size()); ++i) { + IndexType index = active[i]; const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);