Skip to content

Commit

Permalink
Use 128 bit multiply for TT index
Browse files Browse the repository at this point in the history
Remove super cluster stuff from TT and just use a 128 bit multiply.

STC https://tests.stockfishchess.org/tests/view/5ee719b3aae8aec816ab7548
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12736 W: 2502 L: 2333 D: 7901
Ptnml(0-2): 191, 1452, 2944, 1559, 222

LTC https://tests.stockfishchess.org/tests/view/5ee732d1aae8aec816ab7556
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 27584 W: 3431 L: 3350 D: 20803
Ptnml(0-2): 173, 2500, 8400, 2511, 208

The indexing scheme is once again derived from https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/

Also, the default optimized version of the index calculation now uses fewer instructions:
https://godbolt.org/z/Tktxbv
It might benefit further from the mulx instruction (requires -mbmi2).

closes #2744

bench: 4320954
  • Loading branch information
mstembera authored and vondele committed Jun 17, 2020
1 parent 995ee4b commit 1ea488d
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 22 deletions.
13 changes: 13 additions & 0 deletions src/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,19 @@ class PRNG {
{ return T(rand64() & rand64() & rand64()); }
};

/// mul_hi64() returns the upper 64 bits of the full 128-bit product a * b,
/// i.e. floor(a * b / 2^64).
inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
#if defined(__GNUC__) && defined(IS_64BIT)
    // A native 128-bit integer type is available: form the whole product
    // and keep only its top half.
    __extension__ typedef unsigned __int128 uint128;
    const uint128 product = (uint128)a * (uint128)b;
    return (uint64_t)(product >> 64);
#else
    // Portable path: schoolbook multiplication on 32-bit halves. Every
    // partial sum below fits in 64 bits, so no carry is ever lost.
    const uint64_t loA = a & 0xFFFFFFFFULL;
    const uint64_t hiA = a >> 32;
    const uint64_t loB = b & 0xFFFFFFFFULL;
    const uint64_t hiB = b >> 32;

    // Carry out of the lowest 32 bits of the product
    const uint64_t lowCarry = (loA * loB) >> 32;

    // First cross term plus the incoming carry
    const uint64_t crossA = hiA * loB + lowCarry;

    // Second cross term plus the low half of the first one
    const uint64_t crossB = loA * hiB + (crossA & 0xFFFFFFFFULL);

    return hiA * hiB + (crossA >> 32) + (crossB >> 32);
#endif
}

/// Under Windows it is not possible for a process to run on more than one
/// logical processor group. This usually means to be limited to use max 64
Expand Down
2 changes: 1 addition & 1 deletion src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ namespace {
// search to overwrite a previous full search TT value, so we use a different
// position key in case of an excluded move.
excludedMove = ss->excludedMove;
posKey = pos.key() ^ Key(excludedMove << 16); // Isn't a very good hash
posKey = pos.key() ^ (Key(excludedMove) << 48); // Isn't a very good hash
tte = TT.probe(posKey, ttHit);
ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
ttMove = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
Expand Down
16 changes: 6 additions & 10 deletions src/tt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@ TranspositionTable TT; // Our global transposition table
void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {

// Preserve any existing move for the same position
if (m || (k >> 48) != key16)
if (m || (uint16_t)k != key16)
move16 = (uint16_t)m;

// Overwrite less valuable entries
if ( (k >> 48) != key16
if ((uint16_t)k != key16
|| d - DEPTH_OFFSET > depth8 - 4
|| b == BOUND_EXACT)
{
assert(d >= DEPTH_OFFSET);

key16 = (uint16_t)(k >> 48);
key16 = (uint16_t)k;
value16 = (int16_t)v;
eval16 = (int16_t)ev;
genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
Expand All @@ -65,10 +65,8 @@ void TranspositionTable::resize(size_t mbSize) {

aligned_ttmem_free(mem);

superClusterCount = mbSize * 1024 * 1024 / (sizeof(Cluster) * ClustersPerSuperCluster);

table = static_cast<Cluster*>(
aligned_ttmem_alloc(superClusterCount * ClustersPerSuperCluster * sizeof(Cluster), mem));
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
if (!mem)
{
std::cerr << "Failed to allocate " << mbSize
Expand All @@ -91,8 +89,6 @@ void TranspositionTable::clear() {
{
threads.emplace_back([this, idx]() {

const size_t clusterCount = superClusterCount * ClustersPerSuperCluster;

// Thread binding gives faster search on systems with a first-touch policy
if (Options["Threads"] > 8)
WinProcGroup::bindThisThread(idx);
Expand Down Expand Up @@ -121,7 +117,7 @@ void TranspositionTable::clear() {
TTEntry* TranspositionTable::probe(const Key key, bool& found) const {

TTEntry* const tte = first_entry(key);
const uint16_t key16 = key >> 48; // Use the high 16 bits as key inside the cluster
const uint16_t key16 = (uint16_t)key; // Use the low 16 bits as key inside the cluster

for (int i = 0; i < ClusterSize; ++i)
if (!tte[i].key16 || tte[i].key16 == key16)
Expand Down
12 changes: 2 additions & 10 deletions src/tt.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ struct TTEntry {
class TranspositionTable {

static constexpr int ClusterSize = 3;
static constexpr int ClustersPerSuperCluster = 256;

struct Cluster {
TTEntry entry[ClusterSize];
Expand All @@ -84,20 +83,13 @@ class TranspositionTable {
void clear();

// first_entry() returns a pointer to the first TTEntry of the cluster that
// `key` maps to inside `table`.
//
// NOTE(review): this listing appears to interleave the removed and the added
// lines of the change. The supercluster-based computation and its return
// look like the pre-change body, and the final mul_hi64() return looks like
// its replacement. Confirm against the actual file before relying on either.
TTEntry* first_entry(const Key key) const {

// The index is computed from
// Idx = (K48 * SCC) / 2^40, with K48 the 48 lowest bits swizzled.

// Low 32 bits of the key, scaled by the supercluster count
const uint64_t firstTerm = uint32_t(key) * uint64_t(superClusterCount);
// Bits 32..47 of the key, scaled by the supercluster count and shifted down by 16
const uint64_t secondTerm = (uint16_t(key >> 32) * uint64_t(superClusterCount)) >> 16;

return &table[(firstTerm + secondTerm) >> 24].entry[0];
// mul_hi64() yields the high 64 bits of key * clusterCount, so the index is
// floor(key * clusterCount / 2^64), which stays below clusterCount
// (assumes Key is a 64-bit unsigned type; TODO confirm).
return &table[mul_hi64(key, clusterCount)].entry[0];
}

private:
friend struct TTEntry;

size_t superClusterCount;
size_t clusterCount;
Cluster* table;
void* mem;
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
Expand Down
1 change: 0 additions & 1 deletion src/ucioption.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const

void init(OptionsMap& o) {

// At most 2^32 superclusters. Supercluster = 8 kB
constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048;

o["Debug Log File"] << Option("", on_logger);
Expand Down

0 comments on commit 1ea488d

Please sign in to comment.