diff --git a/Readme.md b/Readme.md index a759eff665b..35ff095d679 100644 --- a/Readme.md +++ b/Readme.md @@ -42,7 +42,7 @@ Currently, Stockfish has the following UCI options: this equal to the number of CPU cores available. * #### Hash - The size of the hash table in MB. + The size of the hash table in MB. It is recommended to set Hash after setting Threads. * #### Clear Hash Clear the hash table. @@ -138,6 +138,30 @@ more compact than Nalimov tablebases, while still storing all information needed for optimal play and in addition being able to take into account the 50-move rule. +## Large Pages + +Stockfish supports large pages on Linux and Windows. Large pages make +the hash access more efficient, improving the engine speed, especially +with large hash sizes. Typical increases are 5-10% in terms of nps (nodes per second), but +speed increases of up to 30% have been measured. The support is +automatic: Stockfish attempts to use large pages when available and +falls back to regular memory allocation when this is not the case. + +### Support on Linux + +Large page support on Linux is provided by the Linux kernel's +transparent huge pages functionality. Typically, transparent huge pages +are already enabled and no configuration is needed. + +### Support on Windows + +The use of large pages requires the "Lock Pages in Memory" privilege. See +[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows) +for how to enable this privilege. Logging out and back in may be needed +afterwards. Due to memory fragmentation, it may not always be +possible to allocate large pages even when enabled. A reboot +might alleviate this problem. To determine whether large pages +are in use, see the engine log. 
## Compiling Stockfish yourself from the sources
diff --git a/src/main.cpp b/src/main.cpp
index 6eeda66dff8..c7cf2c6f28f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -49,6 +49,7 @@ int main(int argc, char* argv[]) {
 
   UCI::loop(argc, argv);
 
+  TT.resize(0); // release the transposition table memory before exiting
   Threads.set(0);
   return 0;
 }
diff --git a/src/misc.cpp b/src/misc.cpp
index 946810088da..b1c0feeb9e1 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -309,6 +309,76 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
   return mem;
 }
 
+#elif defined(_WIN64)
+
+/// aligned_ttmem_alloc_large_pages() tries to allocate allocSize bytes, rounded
+/// up to a multiple of the large page size, using Windows large pages. It
+/// returns nullptr when large pages are not supported, SeLockMemoryPrivilege
+/// cannot be enabled or the allocation itself fails.
+static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+
+  HANDLE hProcessToken { };
+  LUID luid { };
+  void* mem = nullptr;
+
+  const size_t largePageSize = GetLargePageMinimum();
+  if (!largePageSize)
+      return nullptr;
+
+  // We need SeLockMemoryPrivilege, so try to enable it for the process
+  if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
+      return nullptr;
+
+  if (LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, &luid))
+  {
+      TOKEN_PRIVILEGES tp { };
+      TOKEN_PRIVILEGES prevTp { };
+      DWORD prevTpLen = 0;
+
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Luid = luid;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+      // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
+      // we still need to query GetLastError() to ensure that the privileges were actually obtained...
+      if (AdjustTokenPrivileges(
+              hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) &&
+          GetLastError() == ERROR_SUCCESS)
+      {
+          // round up size to full pages and allocate
+          allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
+          mem = VirtualAlloc(
+              nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);
+
+          // privilege no longer needed, restore previous state
+          AdjustTokenPrivileges(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
+      }
+  }
+
+  CloseHandle(hProcessToken);
+
+  return mem;
+}
+
+/// Windows version of aligned_ttmem_alloc(): large pages are tried first, with
+/// a fall back to a regular VirtualAlloc(). The outcome is reported through an
+/// "info string" line. Memory obtained here is released by aligned_ttmem_free().
+void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+
+  // try to allocate large pages
+  mem = aligned_ttmem_alloc_large_pages(allocSize);
+  if (mem)
+      sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
+  else
+      sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
+
+  // fall back to regular, page aligned, allocation if necessary
+  if (!mem)
+      mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+
+  return mem;
+}
+
 #else
 
 void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
@@ -322,6 +392,30 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
 
 #endif
 
+/// aligned_ttmem_free() releases memory obtained from aligned_ttmem_alloc().
+/// It expects the mem pointer that aligned_ttmem_alloc() stored in its second
+/// argument; passing nullptr is a no-op on every platform.
+#if defined(_WIN64)
+
+void aligned_ttmem_free(void* mem) {
+
+  if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
+  {
+      DWORD err = GetLastError();
+      std::cerr << "Failed to free transposition table. Error code: 0x" <<
+          std::hex << err << std::dec << std::endl;
+      exit(EXIT_FAILURE);
+  }
+}
+
+#else
+
+void aligned_ttmem_free(void *mem) {
+  free(mem);
+}
+
+#endif
+
 
 namespace WinProcGroup {
 
diff --git a/src/misc.h b/src/misc.h
index e0e0e98be83..9d53c2dab12 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -34,6 +34,7 @@ const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* aligned_ttmem_alloc(size_t size, void*& mem);
+void aligned_ttmem_free(void* mem); // nop if mem == nullptr
 
 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
diff --git a/src/tt.cpp b/src/tt.cpp
index 7e95a2a4e6d..6ee63138d15 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -63,7 +63,17 @@ void TranspositionTable::resize(size_t mbSize) {
 
   Threads.main()->wait_for_search_finished();
 
-  free(mem);
+  if (mem)
+      aligned_ttmem_free(mem);
+
+  if (!mbSize)
+  {
+      // Nothing to allocate: drop every reference to the released memory
+      mem = nullptr;
+      table = nullptr;
+      clusterCount = 0;
+      return;
+  }
 
   clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
   table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));