From 6583ee5d3c991a16035464c2e298d1d086c72278 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Fri, 20 May 2022 17:58:16 +0800 Subject: [PATCH 01/25] MVP version of Data compress (#1) --- AnnService/CoreLibrary.vcxproj | 14 +- AnnService/CoreLibrary.vcxproj.filters | 3 + AnnService/SSDServing.vcxproj | 12 +- AnnService/inc/Core/SPANN/Compressor.h | 64 +++++ .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 239 ++++++++++++++---- AnnService/inc/Core/SPANN/IExtraSearcher.h | 5 +- AnnService/inc/Core/SPANN/Options.h | 1 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + AnnService/src/Core/SPANN/SPANNIndex.cpp | 4 +- 9 files changed, 288 insertions(+), 55 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Compressor.h diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index 386c0f38..78d967ae 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -99,7 +99,11 @@ ProgramDatabase /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) stdcpp17 + C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include + + C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib + @@ -127,12 +131,19 @@ true _MBCS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) - stdcpp17 + Default + C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include true true + + C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib + + + XCOPY C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y + @@ -160,6 +171,7 @@ + diff --git a/AnnService/CoreLibrary.vcxproj.filters b/AnnService/CoreLibrary.vcxproj.filters index 453a4795..f260b078 100644 --- a/AnnService/CoreLibrary.vcxproj.filters +++ b/AnnService/CoreLibrary.vcxproj.filters @@ -214,6 +214,9 @@ Header Files\Core\Common + + Header Files\Core\SPANN + diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index bf37ca74..29815d13 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -54,7 +54,7 @@ MultiByte - StaticLibrary + Application false v142 true @@ -152,12 +152,13 @@ true _$(OutputType);_MBCS;_SCL_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true - %(AdditionalIncludeDirectories) + C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include;%(AdditionalIncludeDirectories) NotUsing inc/SSDServing/Common/stdafx.h true /Zc:twoPhase- %(AdditionalOptions) - stdcpp17 + Default + Disabled Console @@ -165,8 +166,11 @@ true true %(AdditionalLibraryDirectories) - CoreLibrary.lib;%(AdditionalDependencies) + CoreLibrary.lib;C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib;%(AdditionalDependencies) + + XCOPY C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y + diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h new file mode 100644 index 00000000..59952fc1 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_COMPRESSOR_H_ +#define _SPTAG_SPANN_COMPRESSOR_H_ + +#include +#include "zstd.h" +#include "../Common.h" + +namespace SPTAG { + namespace SPANN { + class Compressor + { + public: + Compressor() + { + compress_level = 1; + } + + virtual ~Compressor(){} + + std::string Compress(const std::string &src) + { + size_t est_comp_size = ZSTD_compressBound(src.size()); + std::string buffer{}; + buffer.resize(est_comp_size); // TODO: reuse buffer + size_t compressed_size = ZSTD_compress((void*)buffer.data(), est_comp_size, + src.data(), src.size(), 1); // TODO: change compress level + buffer.resize(compressed_size); + buffer.shrink_to_fit(); + + return buffer; + } + + std::string Decompress(const char * src, size_t srcSize) + { + size_t est_decomp_size = ZSTD_getDecompressedSize(src, srcSize); + std::string dst{}; + dst.resize(est_decomp_size); + size_t const decomp_size = ZSTD_decompress( + (void*)dst.data(), est_decomp_size, src, srcSize); + dst.resize(decomp_size); + dst.shrink_to_fit(); + + return dst; + } + + // return the compressed sie + size_t GetCompressedSize(const std::string &src) + { + std::string dst = Compress(src); + return dst.size(); + } + + private: + int compress_level; + //std::string buffer{}; + std::string dictionary{}; + }; + } // SPANN +} // SPTAG + +#endif // _SPTAG_SPANN_COMPRESSOR_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 0afb0eb7..bfdcf49a 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -8,11 +8,13 @@ #include "inc/Helper/AsyncFileReader.h" #include "IExtraSearcher.h" #include "../Common/TruthSet.h" +#include "Compressor.h" #include #include #include #include +#include namespace SPTAG { @@ -79,8 +81,8 @@ namespace SPTAG } }; -#define ProcessPosting(vectorInfoSize) \ - for (char *vectorInfo = buffer + listInfo->pageOffset, *vectorInfoEnd = vectorInfo + listInfo->listEleCount * vectorInfoSize; vectorInfo < vectorInfoEnd; vectorInfo += vectorInfoSize) { \ +#define ProcessPosting(p_postingListFullData, vectorInfoSize) \ + for (char *vectorInfo = p_postingListFullData, *vectorInfoEnd = vectorInfo + listInfo->listEleCount * vectorInfoSize; vectorInfo < vectorInfoEnd; vectorInfo += vectorInfoSize) { \ int vectorID = *(reinterpret_cast(vectorInfo)); \ if (p_exWorkSpace->m_deduper.CheckAndSet(vectorID)) continue; \ auto distance2leaf = p_index->ComputeDistance(queryResults.GetQuantizedTarget(), vectorInfo + sizeof(int)); \ @@ -136,7 +138,9 @@ namespace SPTAG virtual void SearchIndex(ExtraWorkSpace* p_exWorkSpace, QueryResult& p_queryResults, std::shared_ptr p_index, - SearchStats* p_stats, std::set* truth, std::map>* found) + SearchStats* p_stats, + bool m_enableDataCompression, + std::set* truth, std::map>* found) { const uint32_t postingListCount = static_cast(p_exWorkSpace->m_postingIDs.size()); @@ -187,11 +191,32 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize](Helper::AsyncReadRequest* request) + std::shared_ptr m_pCompressor; //TODO: reuse + + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDataCompression, m_pCompressor](Helper::AsyncReadRequest* request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); - ProcessPosting(vectorInfoSize) + + // decompress posting list + char* p_postingListFullData; + std::string postingListFullData = ""; + if (m_enableDataCompression) + { + postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + if (postingListFullData.size() != listInfo->listEleCount * vectorInfoSize) + { + LOG(Helper::LogLevel::LL_Info, "postingListFullData size not match! %zu, %d, \n", postingListFullData.size(), listInfo->listEleCount * vectorInfoSize); + exit(1); + } + p_postingListFullData = const_cast(postingListFullData.c_str()); + } + else + { + p_postingListFullData = buffer + listInfo->pageOffset; + } + + ProcessPosting(p_postingListFullData, vectorInfoSize); }; #else // async read request.m_callback = [&p_exWorkSpace](Helper::AsyncReadRequest* request) @@ -232,7 +257,7 @@ namespace SPTAG } #endif #endif - if (truth) { + if (truth) { // TODO: check truth for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; @@ -240,9 +265,23 @@ namespace SPTAG ListInfo* listInfo = &(m_listInfos[curPostingID / m_listPerFile][curPostingID % m_listPerFile]); char* buffer = (char*)((p_exWorkSpace->m_pageBuffers[pi]).GetBuffer()); - for (int i = 0; i < listInfo->listEleCount; ++i) { - char* vectorInfo = buffer + listInfo->pageOffset + i * m_vectorInfoSize; + char* p_postingListFullData; + if (m_enableDataCompression) + { + std::string postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + p_postingListFullData = const_cast(postingListFullData.c_str()); + } + else + { + p_postingListFullData = buffer + listInfo->pageOffset; + } + + for (size_t i = 0; i < listInfo->listEleCount; ++i) { + char* vectorInfo = p_postingListFullData + i * m_vectorInfoSize; int vectorID = *(reinterpret_cast(vectorInfo)); + + LOG(Helper::LogLevel::LL_Info, "vectorID: %d\n", vectorID); + if (truth && truth->count(vectorID)) (*found)[curPostingID].insert(vectorID); } } @@ -256,6 +295,27 @@ namespace SPTAG } } + std::string GetPostingListFullData(int indexPostingList, + size_t p_postingListSize, + Selection& p_selections, + std::shared_ptr p_fullVectors) + { + std::string postingListFullData = ""; + size_t selectIdx = p_selections.lower_bound(indexPostingList); + // iterate over all the vectors in the posting list + for (int i = 0; i < p_postingListSize; ++i) + { + if (p_selections[selectIdx].node != indexPostingList) + { + LOG(Helper::LogLevel::LL_Error, "Selection ID NOT MATCH! node:%d offset:%zu\n", indexPostingList, selectIdx); + exit(1); + } + int vid = p_selections[selectIdx++].tonode; + postingListFullData.append(reinterpret_cast(&vid), sizeof(int)); + postingListFullData.append(reinterpret_cast(p_fullVectors->GetVector(vid)), p_fullVectors->PerVectorDataSize()); + } + return postingListFullData; + } bool BuildIndex(std::shared_ptr& p_reader, std::shared_ptr p_headIndex, Options& p_opt) { std::string outputFile = p_opt.m_indexDirectory + FolderSep + p_opt.m_ssdIndex; @@ -267,7 +327,7 @@ namespace SPTAG int numThreads = p_opt.m_iSSDNumberOfThreads; int candidateNum = p_opt.m_internalResultNum; - + // get headVectorIDs std::unordered_set headVectorIDS; if (p_opt.m_headIDFile.empty()) { LOG(Helper::LogLevel::LL_Error, "Not found VectorIDTranslate!\n"); @@ -444,6 +504,7 @@ namespace SPTAG auto t4 = std::chrono::high_resolution_clock::now(); LOG(SPTAG::Helper::LogLevel::LL_Info, "Time to perform posting cut:%.2lf sec.\n", ((double)std::chrono::duration_cast(t4 - t3).count()) + ((double)std::chrono::duration_cast(t4 - t3).count()) / 1000); + // number of posting lists per file size_t postingFileSize = (postingListSize.size() + p_opt.m_ssdIndexFileNum - 1) / p_opt.m_ssdIndexFileNum; std::vector selectionsBatchOffset(p_opt.m_ssdIndexFileNum + 1, 0); for (int i = 0; i < p_opt.m_ssdIndexFileNum; i++) { @@ -455,24 +516,69 @@ namespace SPTAG auto fullVectors = p_reader->GetVectorSet(); if (p_opt.m_distCalcMethod == DistCalcMethod::Cosine && !p_reader->IsNormalized() && !p_headIndex->m_pQuantizer) fullVectors->Normalize(p_opt.m_iSSDNumberOfThreads); - + + // get compressed size of each posting list + LOG(Helper::LogLevel::LL_Info, "m_enableDataCompression: %s\n", p_opt.m_enableDataCompression ? "true" : "false"); + std::vector postingListBytes(headVectorIDS.size()); + if (p_opt.m_enableDataCompression) + { + LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); + // TODO: omp parallel + for (int i = 0; i < postingListSize.size(); i++) { + // do not compress if no data + if (postingListSize[i] == 0) { + postingListBytes[i] = 0; + continue; + } + std::string postingListFullData = GetPostingListFullData(i, postingListSize[i], selections, fullVectors); + size_t sizeToCompress = postingListSize[i] * vectorInfoSize; + if (sizeToCompress != postingListFullData.size()) { + LOG(Helper::LogLevel::LL_Error, "Size to compress NOT MATCH! PostingListFullData size: %zu sizeToCompress: %zu \n", postingListFullData.size(), sizeToCompress); + } + postingListBytes[i] = m_pCompressor->GetCompressedSize(postingListFullData); + if (i % 10000 == 0) { + LOG(Helper::LogLevel::LL_Info, "Posting list %d/%d, compressed size: %d, compression ratio: %.4f\n", i, postingListSize.size(), postingListBytes[i], postingListBytes[i] / float(sizeToCompress)); + } + } + LOG(Helper::LogLevel::LL_Info, "Getted compressed size for all the %d posting lists.\n", postingListBytes.size()); + LOG(Helper::LogLevel::LL_Info, "Mean compressed size: %.4f \n", std::accumulate(postingListBytes.begin(), postingListBytes.end(), 0.0) / postingListBytes.size()); + LOG(Helper::LogLevel::LL_Info, "Mean compression ratio: %.4f \n", std::accumulate(postingListBytes.begin(), postingListBytes.end(), 0.0) / (std::accumulate(postingListSize.begin(), postingListSize.end(), 0.0) * vectorInfoSize)); + } + else + { + for (int i = 0; i < postingListSize.size(); i++) + { + postingListBytes[i] = postingListSize[i] * vectorInfoSize; + } + } + + // iterate over files for (int i = 0; i < p_opt.m_ssdIndexFileNum; i++) { + // postingFileSize: number of posting lists in the file + // postingListSize: number of vectors in the posting list, type vector size_t curPostingListOffSet = i * postingFileSize; size_t curPostingListEnd = min(postingListSize.size(), (i + 1) * postingFileSize); std::vector curPostingListSizes( postingListSize.begin() + curPostingListOffSet, postingListSize.begin() + curPostingListEnd); + std::vector curPostingListBytes; + curPostingListBytes.assign( + postingListBytes.begin() + curPostingListOffSet, + postingListBytes.begin() + curPostingListEnd); std::unique_ptr postPageNum; std::unique_ptr postPageOffset; std::vector postingOrderInIndex; - SelectPostingOffset(vectorInfoSize, curPostingListSizes, postPageNum, postPageOffset, postingOrderInIndex); + SelectPostingOffset(curPostingListBytes, postPageNum, postPageOffset, postingOrderInIndex); + // LoadBatch: select vectors for each posting list if (p_opt.m_ssdIndexFileNum > 1) selections.LoadBatch(selectionsBatchOffset[i], selectionsBatchOffset[i + 1]); - + // write one file OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), + p_opt.m_enableDataCompression, vectorInfoSize, curPostingListSizes, + curPostingListBytes, selections, postPageNum, postPageOffset, @@ -507,6 +613,8 @@ namespace SPTAG private: struct ListInfo { + std::size_t listTotalBytes = 0; + int listEleCount = 0; std::uint16_t listPageCount = 0; @@ -563,6 +671,10 @@ namespace SPTAG int pageNum; for (int i = 0; i < m_listCount; ++i) { + if (ptr->ReadBinary(sizeof(m_listInfos[i].listTotalBytes), reinterpret_cast(&(m_listInfos[i].listTotalBytes))) != sizeof(m_listInfos[i].listTotalBytes)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } if (ptr->ReadBinary(sizeof(pageNum), reinterpret_cast(&(pageNum))) != sizeof(pageNum)) { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); @@ -581,8 +693,8 @@ namespace SPTAG } m_listInfos[i].listOffset = (static_cast(m_listPageOffset + pageNum) << PageSizeEx); - m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); - m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); + // m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); + // m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); totalListElementCount += m_listInfos[i].listEleCount; int pageCount = m_listInfos[i].listPageCount; @@ -609,7 +721,6 @@ namespace SPTAG m_iDataDimension, m_listPageOffset); - LOG(Helper::LogLevel::LL_Info, "Big page (>4K): list count %zu, total element count %zu.\n", biglistCount, @@ -625,14 +736,14 @@ namespace SPTAG return m_listCount; } - void SelectPostingOffset(size_t p_spacePerVector, - const std::vector& p_postingListSizes, + void SelectPostingOffset( + const std::vector& p_postingListBytes, std::unique_ptr& p_postPageNum, std::unique_ptr& p_postPageOffset, std::vector& p_postingOrderInIndex) { - p_postPageNum.reset(new int[p_postingListSizes.size()]); - p_postPageOffset.reset(new std::uint16_t[p_postingListSizes.size()]); + p_postPageNum.reset(new int[p_postingListBytes.size()]); + p_postPageOffset.reset(new std::uint16_t[p_postingListBytes.size()]); struct PageModWithID { @@ -652,18 +763,19 @@ namespace SPTAG std::set listRestSize; p_postingOrderInIndex.clear(); - p_postingOrderInIndex.reserve(p_postingListSizes.size()); + p_postingOrderInIndex.reserve(p_postingListBytes.size()); PageModWithID listInfo; - for (size_t i = 0; i < p_postingListSizes.size(); ++i) + for (size_t i = 0; i < p_postingListBytes.size(); ++i) { - if (p_postingListSizes[i] == 0) + if (p_postingListBytes[i] == 0) { continue; } listInfo.id = static_cast(i); - listInfo.rest = static_cast((p_spacePerVector * p_postingListSizes[i]) % PageSize); + size_t postingListByte = p_postingListBytes[i]; + listInfo.rest = static_cast(postingListByte % PageSize); listRestSize.insert(listInfo); } @@ -676,7 +788,7 @@ namespace SPTAG while (!listRestSize.empty()) { listInfo.rest = PageSize - currOffset; - auto iter = listRestSize.lower_bound(listInfo); + auto iter = listRestSize.lower_bound(listInfo); // avoid page-crossing if (iter == listRestSize.end()) { ++currPageNum; @@ -702,7 +814,7 @@ namespace SPTAG currOffset = 0; } - currPageNum += static_cast((p_spacePerVector * p_postingListSizes[iter->id]) / PageSize); + currPageNum += static_cast(p_postingListBytes[iter->id] / PageSize); listRestSize.erase(iter); } @@ -713,8 +825,10 @@ namespace SPTAG void OutputSSDIndexFile(const std::string& p_outputFile, + bool p_enableDataCompression, size_t p_spacePerVector, const std::vector& p_postingListSizes, + const std::vector& p_postingListBytes, Selection& p_postingSelections, const std::unique_ptr& p_postPageNum, const std::unique_ptr& p_postPageOffset, @@ -728,9 +842,10 @@ namespace SPTAG auto ptr = SPTAG::f_createIO(); int retry = 3; + // open file while (retry > 0 && (ptr == nullptr || !ptr->Initialize(p_outputFile.c_str(), std::ios::binary | std::ios::out))) { - LOG(Helper::LogLevel::LL_Error, "Failed open file %s\n", p_outputFile.c_str()); + LOG(Helper::LogLevel::LL_Error, "Failed open file %s, retrying...\n", p_outputFile.c_str()); retry--; } @@ -738,13 +853,14 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed open file %s\n", p_outputFile.c_str()); exit(1); } - + // meta size of global info std::uint64_t listOffset = sizeof(int) * 4; - listOffset += (sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); + // meta size of the posting lists + listOffset += (sizeof(size_t) + sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); std::unique_ptr paddingVals(new char[PageSize]); memset(paddingVals.get(), 0, sizeof(char) * PageSize); - + // paddingSize: bytes left in the last page std::uint64_t paddingSize = PageSize - (listOffset % PageSize); if (paddingSize == PageSize) { @@ -755,37 +871,39 @@ namespace SPTAG listOffset += paddingSize; } - // Number of lists. + // Number of posting lists int i32Val = static_cast(p_postingListSizes.size()); if (ptr->WriteBinary(sizeof(i32Val), reinterpret_cast(&i32Val)) != sizeof(i32Val)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - // Number of all documents. + // Number of vectors i32Val = static_cast(p_fullVectors->Count()); if (ptr->WriteBinary(sizeof(i32Val), reinterpret_cast(&i32Val)) != sizeof(i32Val)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - // Bytes of each vector. + // Vector dimension i32Val = static_cast(p_fullVectors->Dimension()); if (ptr->WriteBinary(sizeof(i32Val), reinterpret_cast(&i32Val)) != sizeof(i32Val)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - // Page offset of list content section. + // Page offset of list content section i32Val = static_cast(listOffset / PageSize); if (ptr->WriteBinary(sizeof(i32Val), reinterpret_cast(&i32Val)) != sizeof(i32Val)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } + // Meta of each posting list for (int i = 0; i < p_postingListSizes.size(); ++i) { - int pageNum = 0; + size_t postingListByte = 0; + int pageNum = 0; // starting page number std::uint16_t pageOffset = 0; int listEleCount = 0; std::uint16_t listPageCount = 0; @@ -795,30 +913,41 @@ namespace SPTAG pageNum = p_postPageNum[i]; pageOffset = static_cast(p_postPageOffset[i]); listEleCount = static_cast(p_postingListSizes[i]); - listPageCount = static_cast((p_spacePerVector * p_postingListSizes[i]) / PageSize); - if (0 != ((p_spacePerVector * p_postingListSizes[i]) % PageSize)) + postingListByte = p_postingListBytes[i]; + listPageCount = static_cast(postingListByte / PageSize); + if (0 != (postingListByte % PageSize)) { ++listPageCount; } } + // Total bytes of the posting list + if (ptr->WriteBinary(sizeof(postingListByte), reinterpret_cast(&postingListByte)) != sizeof(postingListByte)) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + // Page number of the posting list if (ptr->WriteBinary(sizeof(pageNum), reinterpret_cast(&pageNum)) != sizeof(pageNum)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } + // Page offset if (ptr->WriteBinary(sizeof(pageOffset), reinterpret_cast(&pageOffset)) != sizeof(pageOffset)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } + // Number of vectors in the posting list if (ptr->WriteBinary(sizeof(listEleCount), reinterpret_cast(&listEleCount)) != sizeof(listEleCount)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } + // Page count of the posting list if (ptr->WriteBinary(sizeof(listPageCount), reinterpret_cast(&listPageCount)) != sizeof(listPageCount)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } } + // Write padding vals if (paddingSize > 0) { if (ptr->WriteBinary(paddingSize, reinterpret_cast(paddingVals.get())) != paddingSize) { @@ -838,6 +967,7 @@ namespace SPTAG listOffset = 0; std::uint64_t paddedSize = 0; + // iterate over all the posting lists for (auto id : p_postingOrderInIndex) { std::uint64_t targetOffset = static_cast(p_postPageNum[id]) * PageSize + p_postPageOffset[id]; @@ -846,7 +976,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "List offset not match, targetOffset < listOffset!\n"); exit(1); } - + // write padding vals before the posting list if (targetOffset > listOffset) { if (targetOffset - listOffset > PageSize) @@ -865,25 +995,38 @@ namespace SPTAG listOffset = targetOffset; } - std::size_t selectIdx = p_postingSelections.lower_bound(id + (int)p_postingListOffset); - for (int j = 0; j < p_postingListSizes[id]; ++j) - { - if (p_postingSelections[selectIdx].node != id + (int)p_postingListOffset) + if (p_postingListSizes[id]==0) { + continue; + } + int indexPostingList = id + (int)p_postingListOffset; + // get posting list full content and write it at once + std::string postingListFullData = GetPostingListFullData(indexPostingList, p_postingListSizes[id], p_postingSelections, p_fullVectors); + size_t postingListFullSize = p_postingListSizes[indexPostingList] * p_spacePerVector; + if (postingListFullSize != postingListFullData.size()) { + LOG(Helper::LogLevel::LL_Error, "posting list full data size NOT MATCH! postingListFullData.size(): %zu postingListFullSize: %zu \n", postingListFullData.size(), postingListFullSize); + exit(1); + } + if (p_enableDataCompression) { + std::string compressedData = m_pCompressor->Compress(postingListFullData); + size_t compressedSize = compressedData.size(); + if (compressedSize != p_postingListBytes[indexPostingList]) { - LOG(Helper::LogLevel::LL_Error, "Selection ID NOT MATCH! node:%d offset:%zu\n", id + (int)p_postingListOffset, selectIdx); + LOG(Helper::LogLevel::LL_Error, "Compressed size NOT MATCH! compressed size:%zu, pre-calculated compressed size:%zu\n", compressedSize, p_postingListBytes[indexPostingList]); exit(1); } - - i32Val = p_postingSelections[selectIdx++].tonode; - if (ptr->WriteBinary(sizeof(i32Val), reinterpret_cast(&i32Val)) != sizeof(i32Val)) { + if (ptr->WriteBinary(compressedSize, compressedData.data()) != compressedSize) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - if (ptr->WriteBinary(p_fullVectors->PerVectorDataSize(), reinterpret_cast(p_fullVectors->GetVector(i32Val))) != p_fullVectors->PerVectorDataSize()) { + listOffset += compressedSize; + } + else + { + if (ptr->WriteBinary(postingListFullSize, postingListFullData.data()) != postingListFullSize) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - listOffset += p_spacePerVector; + listOffset += postingListFullSize; } } @@ -921,6 +1064,8 @@ namespace SPTAG std::vector> m_indexFiles; + std::unique_ptr m_pCompressor; // TOOD: not initialized + int m_vectorInfoSize = 0; int m_totalListCount = 0; diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 3063d8cc..0cf20ebb 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -165,7 +165,10 @@ namespace SPTAG { virtual void SearchIndex(ExtraWorkSpace* p_exWorkSpace, QueryResult& p_queryResults, std::shared_ptr p_index, - SearchStats* p_stats, std::set* truth = nullptr, std::map>* found = nullptr) = 0; + SearchStats* p_stats, + bool m_enableDataCompression = false, + std::set* truth = nullptr, + std::map>* found = nullptr) = 0; virtual bool BuildIndex(std::shared_ptr& p_reader, std::shared_ptr p_index, diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 133d51a4..071c27d2 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -79,6 +79,7 @@ namespace SPTAG { bool m_enableSSD; bool m_buildSsdIndex; int m_iSSDNumberOfThreads; + bool m_enableDataCompression; // Building int m_replicaCount; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 72f38fe1..b00d234f 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -77,6 +77,7 @@ DefineBuildHeadParameter(m_buildHead, bool, false, "isExecute") DefineSSDParameter(m_enableSSD, bool, false, "isExecute") DefineSSDParameter(m_buildSsdIndex, bool, false, "BuildSsdIndex") DefineSSDParameter(m_iSSDNumberOfThreads, int, 16, "NumberOfThreads") +DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") // Building DefineSSDParameter(m_internalResultNum, int, 64, "InternalResultNum") diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index 96d652b0..7b880d31 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -229,7 +229,7 @@ namespace SPTAG } p_queryResults->Reverse(); - m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr); + m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr, m_options.m_enableDataCompression); p_queryResults->SortResult(); m_workSpacePool->Return(workSpace); } @@ -286,7 +286,7 @@ namespace SPTAG auto_ws->m_postingIDs.emplace_back(res->VID); } - m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, truth, found); + m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, m_options.m_enableDataCompression, truth, found); } m_workSpacePool->Return(auto_ws); From f61c390a54c4ec8c8c4492388c9d98c45c82e2be Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 24 May 2022 09:24:37 +0800 Subject: [PATCH 02/25] Support delta-encoding (#2) --- AnnService/CoreLibrary.vcxproj | 2 +- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 73 +++++++++++++++---- AnnService/inc/Core/SPANN/IExtraSearcher.h | 1 + AnnService/inc/Core/SPANN/Options.h | 1 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + AnnService/src/Core/SPANN/SPANNIndex.cpp | 4 +- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index 78d967ae..e562ee8e 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -123,7 +123,7 @@ Level3 - MaxSpeed + Disabled true true true diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index bfdcf49a..a7e874a8 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -139,6 +139,7 @@ namespace SPTAG QueryResult& p_queryResults, std::shared_ptr p_index, SearchStats* p_stats, + bool m_enableDeltaEncoding, bool m_enableDataCompression, std::set* truth, std::map>* found) { @@ -192,8 +193,8 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; std::shared_ptr m_pCompressor; //TODO: reuse - - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDataCompression, m_pCompressor](Helper::AsyncReadRequest* request) + + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDeltaEncoding, curPostingID, m_enableDataCompression, m_pCompressor](Helper::AsyncReadRequest* request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); @@ -211,11 +212,24 @@ namespace SPTAG } p_postingListFullData = const_cast(postingListFullData.c_str()); } - else + else { p_postingListFullData = buffer + listInfo->pageOffset; } + // delta encoding + if (m_enableDeltaEncoding) { + ValueType* headVector = (ValueType*)p_index->GetSample(curPostingID); + for (char* vectorInfo = p_postingListFullData, *vectorInfoEnd = vectorInfo + listInfo->listEleCount * vectorInfoSize; vectorInfo < vectorInfoEnd; vectorInfo += vectorInfoSize) + { + ValueType* leaf = reinterpret_cast(vectorInfo + sizeof(int)); + for (auto i = 0; i < p_index->GetFeatureDim(); i++) + { + leaf[i] += headVector[i]; + } + } + } + ProcessPosting(p_postingListFullData, vectorInfoSize); }; #else // async read @@ -295,24 +309,42 @@ namespace SPTAG } } - std::string GetPostingListFullData(int indexPostingList, + std::string GetPostingListFullData( + int postingListId, size_t p_postingListSize, Selection& p_selections, - std::shared_ptr p_fullVectors) + std::shared_ptr p_fullVectors, + bool m_enableDeltaEncoding, + const ValueType *headVector) { std::string postingListFullData = ""; - size_t selectIdx = p_selections.lower_bound(indexPostingList); + size_t selectIdx = p_selections.lower_bound(postingListId); // iterate over all the vectors in the posting list for (int i = 0; i < p_postingListSize; ++i) { - if (p_selections[selectIdx].node != indexPostingList) + if (p_selections[selectIdx].node != postingListId) { - LOG(Helper::LogLevel::LL_Error, "Selection ID NOT MATCH! node:%d offset:%zu\n", indexPostingList, selectIdx); + LOG(Helper::LogLevel::LL_Error, "Selection ID NOT MATCH! node:%d offset:%zu\n", postingListId, selectIdx); exit(1); } int vid = p_selections[selectIdx++].tonode; postingListFullData.append(reinterpret_cast(&vid), sizeof(int)); - postingListFullData.append(reinterpret_cast(p_fullVectors->GetVector(vid)), p_fullVectors->PerVectorDataSize()); + ValueType* p_vector = reinterpret_cast(p_fullVectors->GetVector(vid)); + + if (m_enableDeltaEncoding) { + DimensionType n = p_fullVectors->Dimension(); + std::vector p_vector_delta(n); + for (auto j = 0; j < n; j++) + { + p_vector_delta[j] = p_vector[j] - headVector[j]; + } + postingListFullData.append(reinterpret_cast(&p_vector_delta[0]), p_fullVectors->PerVectorDataSize()); + } + else + { + postingListFullData.append(reinterpret_cast(p_vector), p_fullVectors->PerVectorDataSize()); + } + } return postingListFullData; } @@ -518,7 +550,8 @@ namespace SPTAG if (p_opt.m_distCalcMethod == DistCalcMethod::Cosine && !p_reader->IsNormalized() && !p_headIndex->m_pQuantizer) fullVectors->Normalize(p_opt.m_iSSDNumberOfThreads); // get compressed size of each posting list - LOG(Helper::LogLevel::LL_Info, "m_enableDataCompression: %s\n", p_opt.m_enableDataCompression ? "true" : "false"); + LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDataCompression ? "true" : "false"); + LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); std::vector postingListBytes(headVectorIDS.size()); if (p_opt.m_enableDataCompression) { @@ -530,7 +563,9 @@ namespace SPTAG postingListBytes[i] = 0; continue; } - std::string postingListFullData = GetPostingListFullData(i, postingListSize[i], selections, fullVectors); + ValueType* headVector = (ValueType*)p_headIndex->GetSample(i); + std::string postingListFullData = GetPostingListFullData( + i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, headVector); size_t sizeToCompress = postingListSize[i] * vectorInfoSize; if (sizeToCompress != postingListFullData.size()) { LOG(Helper::LogLevel::LL_Error, "Size to compress NOT MATCH! PostingListFullData size: %zu sizeToCompress: %zu \n", postingListFullData.size(), sizeToCompress); @@ -575,10 +610,12 @@ namespace SPTAG if (p_opt.m_ssdIndexFileNum > 1) selections.LoadBatch(selectionsBatchOffset[i], selectionsBatchOffset[i + 1]); // write one file OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), + p_opt.m_enableDeltaEncoding, p_opt.m_enableDataCompression, vectorInfoSize, curPostingListSizes, curPostingListBytes, + p_headIndex, selections, postPageNum, postPageOffset, @@ -825,10 +862,12 @@ namespace SPTAG void OutputSSDIndexFile(const std::string& p_outputFile, + bool p_enableDeltaEncoding, bool p_enableDataCompression, size_t p_spacePerVector, const std::vector& p_postingListSizes, const std::vector& p_postingListBytes, + std::shared_ptr p_headIndex, Selection& p_postingSelections, const std::unique_ptr& p_postPageNum, const std::unique_ptr& p_postPageOffset, @@ -998,10 +1037,12 @@ namespace SPTAG if (p_postingListSizes[id]==0) { continue; } - int indexPostingList = id + (int)p_postingListOffset; + int postingListId = id + (int)p_postingListOffset; // get posting list full content and write it at once - std::string postingListFullData = GetPostingListFullData(indexPostingList, p_postingListSizes[id], p_postingSelections, p_fullVectors); - size_t postingListFullSize = p_postingListSizes[indexPostingList] * p_spacePerVector; + ValueType* headVector = (ValueType*)p_headIndex->GetSample(postingListId); + std::string postingListFullData = GetPostingListFullData( + postingListId, p_postingListSizes[id], p_postingSelections, p_fullVectors, p_enableDeltaEncoding, headVector); + size_t postingListFullSize = p_postingListSizes[id] * p_spacePerVector; if (postingListFullSize != postingListFullData.size()) { LOG(Helper::LogLevel::LL_Error, "posting list full data size NOT MATCH! postingListFullData.size(): %zu postingListFullSize: %zu \n", postingListFullData.size(), postingListFullSize); exit(1); @@ -1009,9 +1050,9 @@ namespace SPTAG if (p_enableDataCompression) { std::string compressedData = m_pCompressor->Compress(postingListFullData); size_t compressedSize = compressedData.size(); - if (compressedSize != p_postingListBytes[indexPostingList]) + if (compressedSize != p_postingListBytes[id]) { - LOG(Helper::LogLevel::LL_Error, "Compressed size NOT MATCH! compressed size:%zu, pre-calculated compressed size:%zu\n", compressedSize, p_postingListBytes[indexPostingList]); + LOG(Helper::LogLevel::LL_Error, "Compressed size NOT MATCH! compressed size:%zu, pre-calculated compressed size:%zu\n", compressedSize, p_postingListBytes[id]); exit(1); } if (ptr->WriteBinary(compressedSize, compressedData.data()) != compressedSize) { diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 0cf20ebb..e8578542 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -166,6 +166,7 @@ namespace SPTAG { QueryResult& p_queryResults, std::shared_ptr p_index, SearchStats* p_stats, + bool m_enableDeltaEncoding = false, bool m_enableDataCompression = false, std::set* truth = nullptr, std::map>* found = nullptr) = 0; diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 071c27d2..70bdd20c 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -79,6 +79,7 @@ namespace SPTAG { bool m_enableSSD; bool m_buildSsdIndex; int m_iSSDNumberOfThreads; + bool m_enableDeltaEncoding; bool m_enableDataCompression; // Building diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index b00d234f..f54b42a3 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -77,6 +77,7 @@ DefineBuildHeadParameter(m_buildHead, bool, false, "isExecute") DefineSSDParameter(m_enableSSD, bool, false, "isExecute") DefineSSDParameter(m_buildSsdIndex, bool, false, "BuildSsdIndex") DefineSSDParameter(m_iSSDNumberOfThreads, int, 16, "NumberOfThreads") +DefineSSDParameter(m_enableDeltaEncoding, bool, false, "EnableDeltaEncoding") DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") // Building diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index 7b880d31..c592de54 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -229,7 +229,7 @@ namespace SPTAG } p_queryResults->Reverse(); - m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr, m_options.m_enableDataCompression); + m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr, m_options.m_enableDeltaEncoding, m_options.m_enableDataCompression); p_queryResults->SortResult(); m_workSpacePool->Return(workSpace); } @@ -286,7 +286,7 @@ namespace SPTAG auto_ws->m_postingIDs.emplace_back(res->VID); } - m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, m_options.m_enableDataCompression, truth, found); + m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, m_options.m_enableDeltaEncoding, m_options.m_enableDataCompression, truth, found); } m_workSpacePool->Return(auto_ws); From 10035941d5a2d08d4504e4dd53a563927c4ba405 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Wed, 25 May 2022 09:25:25 +0800 Subject: [PATCH 03/25] support config of compress level (#3) --- AnnService/inc/Core/SPANN/Compressor.h | 6 +++--- AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h | 11 ++++++----- AnnService/inc/Core/SPANN/Options.h | 1 + AnnService/inc/Core/SPANN/ParameterDefinitionList.h | 1 + 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 59952fc1..91ac8f93 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -13,9 +13,9 @@ namespace SPTAG { class Compressor { public: - Compressor() + Compressor(int level=0) { - compress_level = 1; + compress_level = level; } virtual ~Compressor(){} @@ -26,7 +26,7 @@ namespace SPTAG { std::string buffer{}; buffer.resize(est_comp_size); // TODO: reuse buffer size_t compressed_size = ZSTD_compress((void*)buffer.data(), est_comp_size, - src.data(), src.size(), 1); // TODO: change compress level + src.data(), src.size(), compress_level); buffer.resize(compressed_size); buffer.shrink_to_fit(); diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index a7e874a8..854b6b8f 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -156,6 +156,7 @@ namespace SPTAG #endif bool oneContext = (m_indexFiles.size() == 1); + std::unique_ptr m_pCompressor = std::make_unique(); // no need compress level info for decompress for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; @@ -192,9 +193,8 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - std::shared_ptr m_pCompressor; //TODO: reuse - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDeltaEncoding, curPostingID, m_enableDataCompression, m_pCompressor](Helper::AsyncReadRequest* request) + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDeltaEncoding, curPostingID, m_enableDataCompression, &m_pCompressor](Helper::AsyncReadRequest* request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); @@ -550,9 +550,10 @@ namespace SPTAG if (p_opt.m_distCalcMethod == DistCalcMethod::Cosine && !p_reader->IsNormalized() && !p_headIndex->m_pQuantizer) fullVectors->Normalize(p_opt.m_iSSDNumberOfThreads); // get compressed size of each posting list - LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDataCompression ? "true" : "false"); LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); + LOG(Helper::LogLevel::LL_Info, "EnableDataCompression: %s, ZstdCompressLevel: %d\n", p_opt.m_enableDataCompression ? "true" : "false", p_opt.m_zstdCompressLevel); std::vector postingListBytes(headVectorIDS.size()); + std::unique_ptr m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); if (p_opt.m_enableDataCompression) { LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); @@ -612,6 +613,7 @@ namespace SPTAG OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), p_opt.m_enableDeltaEncoding, p_opt.m_enableDataCompression, + m_pCompressor, vectorInfoSize, curPostingListSizes, curPostingListBytes, @@ -864,6 +866,7 @@ namespace SPTAG void OutputSSDIndexFile(const std::string& p_outputFile, bool p_enableDeltaEncoding, bool p_enableDataCompression, + const std::unique_ptr& m_pCompressor, size_t p_spacePerVector, const std::vector& p_postingListSizes, const std::vector& p_postingListBytes, @@ -1105,8 +1108,6 @@ namespace SPTAG std::vector> m_indexFiles; - std::unique_ptr m_pCompressor; // TOOD: not initialized - int m_vectorInfoSize = 0; int m_totalListCount = 0; diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 70bdd20c..cfbc0e1d 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -81,6 +81,7 @@ namespace SPTAG { int m_iSSDNumberOfThreads; bool m_enableDeltaEncoding; bool m_enableDataCompression; + int m_zstdCompressLevel; // Building int m_replicaCount; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index f54b42a3..14621582 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -79,6 +79,7 @@ DefineSSDParameter(m_buildSsdIndex, bool, false, "BuildSsdIndex") DefineSSDParameter(m_iSSDNumberOfThreads, int, 16, "NumberOfThreads") DefineSSDParameter(m_enableDeltaEncoding, bool, false, "EnableDeltaEncoding") DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") +DefineSSDParameter(m_zstdCompressLevel, int, 0, "ZstdCompressLevel") // Building DefineSSDParameter(m_internalResultNum, int, 64, "InternalResultNum") From d17a12692160e6bf9c2b43932cf934c886b37aec Mon Sep 17 00:00:00 2001 From: Guoxin Date: Wed, 25 May 2022 20:01:53 +0800 Subject: [PATCH 04/25] integrate zstd with cmake (#4) --- .gitmodules | 3 +++ AnnService/CMakeLists.txt | 4 ++-- CMakeLists.txt | 12 ++++++++++++ Dockerfile | 6 +++++- ThirdParty/zstd | 1 + 5 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 .gitmodules create mode 160000 ThirdParty/zstd diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..a84eabe0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ThirdParty/zstd"] + path = ThirdParty/zstd + url = https://github.com/facebook/zstd diff --git a/AnnService/CMakeLists.txt b/AnnService/CMakeLists.txt index a61b08a3..b8d6525e 100644 --- a/AnnService/CMakeLists.txt +++ b/AnnService/CMakeLists.txt @@ -32,9 +32,9 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") endif() add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES}) -target_link_libraries (SPTAGLib DistanceUtils) +target_link_libraries (SPTAGLib DistanceUtils zstd::libzstd_shared) add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES}) -target_link_libraries (SPTAGLibStatic DistanceUtils) +target_link_libraries (SPTAGLibStatic DistanceUtils zstd::libzstd_static) if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") target_compile_options(SPTAGLibStatic PRIVATE -fPIC) endif() diff --git a/CMakeLists.txt b/CMakeLists.txt index 96713278..af3148a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,18 @@ else() message (FATAL_ERROR "Could not find Boost >= 1.67!") endif() +find_package(zstd) +if (zstd_FOUND) + include_directories (${zstd_INCLUDE_DIR}) + link_directories (${zstd_LIBRARY_DIR}) + message (STATUS "Found zstd.") + message (STATUS "Include Path: ${zstd_INCLUDE_DIR}") + message (STATUS "Library Path: ${zstd_LIBRARY_DIR}") + message (STATUS "Library: ${zstd_LIBRARIES}") +else() + message (FATAL_ERROR "Could not find zstd") +endif() + option(GPU "GPU" ON) option(LIBRARYONLY "LIBRARYONLY" OFF) add_subdirectory (AnnService) diff --git a/Dockerfile b/Dockerfile index 576fa9a8..ef8be89c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,4 +15,8 @@ COPY Test ./Test/ COPY Wrappers ./Wrappers/ COPY GPUSupport ./GPUSupport/ -RUN mkdir build && cd build && cmake .. && make -j && cd .. +# install zstd +COPY ThirdParty ./ThirdParty/ +RUN cd ThirdParty/zstd/build/cmake && rm -rf builddir && mkdir builddir && cd builddir && cmake .. && make -j$(nproc) && make install + +RUN mkdir build && cd build && cmake .. && make -j$(nproc) diff --git a/ThirdParty/zstd b/ThirdParty/zstd new file mode 160000 index 00000000..9a5e73c7 --- /dev/null +++ b/ThirdParty/zstd @@ -0,0 +1 @@ +Subproject commit 9a5e73c74ef2d621992154306ab1ab6ba44ac8fa From 83efc3f88b70911641e95810826a55a0cee2e23d Mon Sep 17 00:00:00 2001 From: Guoxin Date: Sun, 29 May 2022 13:06:53 +0800 Subject: [PATCH 05/25] Bug Fix: wrong listPageCount when listTotalBytes % pageSize==0 (#5) --- AnnService/inc/Core/SPANN/Compressor.h | 24 +++++++- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 58 ++++++++++--------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 91ac8f93..d15aef57 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -24,9 +24,14 @@ namespace SPTAG { { size_t est_comp_size = ZSTD_compressBound(src.size()); std::string buffer{}; - buffer.resize(est_comp_size); // TODO: reuse buffer + buffer.resize(est_comp_size); size_t compressed_size = ZSTD_compress((void*)buffer.data(), est_comp_size, src.data(), src.size(), compress_level); + if (ZSTD_isError(compressed_size)) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD compress error %s, \n", ZSTD_getErrorName(compressed_size)); + exit(1); + } buffer.resize(compressed_size); buffer.shrink_to_fit(); @@ -35,11 +40,25 @@ namespace SPTAG { std::string Decompress(const char * src, size_t srcSize) { - size_t est_decomp_size = ZSTD_getDecompressedSize(src, srcSize); + size_t est_decomp_size = ZSTD_getFrameContentSize(src, srcSize); + if (est_decomp_size == ZSTD_CONTENTSIZE_ERROR) { + LOG(Helper::LogLevel::LL_Error, "not compressed by zstd!\n"); + exit(1); + } + else if (est_decomp_size == ZSTD_CONTENTSIZE_UNKNOWN) { + LOG(Helper::LogLevel::LL_Error, "original size unknown!\n"); + exit(1); + } std::string dst{}; + est_decomp_size *= 10; dst.resize(est_decomp_size); size_t const decomp_size = ZSTD_decompress( (void*)dst.data(), est_decomp_size, src, srcSize); + if (ZSTD_isError(decomp_size)) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); + exit(1); + } dst.resize(decomp_size); dst.shrink_to_fit(); @@ -55,7 +74,6 @@ namespace SPTAG { private: int compress_level; - //std::string buffer{}; std::string dictionary{}; }; } // SPANN diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 854b6b8f..2720627c 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -156,11 +156,10 @@ namespace SPTAG #endif bool oneContext = (m_indexFiles.size() == 1); - std::unique_ptr m_pCompressor = std::make_unique(); // no need compress level info for decompress + auto m_pCompressor = std::make_unique(); // no need compress level to decompress for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; - int fileid = 0; ListInfo* listInfo; if (oneContext) { @@ -200,21 +199,20 @@ namespace SPTAG ListInfo* listInfo = (ListInfo*)(request->m_payload); // decompress posting list - char* p_postingListFullData; - std::string postingListFullData = ""; + char* p_postingListFullData = buffer + listInfo->pageOffset; + std::string postingListFullData(""); if (m_enableDataCompression) { - postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + if (listInfo->listEleCount != 0) + { + postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + } if (postingListFullData.size() != listInfo->listEleCount * vectorInfoSize) { LOG(Helper::LogLevel::LL_Info, "postingListFullData size not match! %zu, %d, \n", postingListFullData.size(), listInfo->listEleCount * vectorInfoSize); exit(1); } - p_postingListFullData = const_cast(postingListFullData.c_str()); - } - else - { - p_postingListFullData = buffer + listInfo->pageOffset; + p_postingListFullData = &postingListFullData[0]; } // delta encoding @@ -230,7 +228,7 @@ namespace SPTAG } } - ProcessPosting(p_postingListFullData, vectorInfoSize); + ProcessPosting(const_cast(p_postingListFullData), vectorInfoSize); }; #else // async read request.m_callback = [&p_exWorkSpace](Helper::AsyncReadRequest* request) @@ -314,10 +312,10 @@ namespace SPTAG size_t p_postingListSize, Selection& p_selections, std::shared_ptr p_fullVectors, - bool m_enableDeltaEncoding, - const ValueType *headVector) + bool m_enableDeltaEncoding=false, + const ValueType *headVector=nullptr) { - std::string postingListFullData = ""; + std::string postingListFullData(""); size_t selectIdx = p_selections.lower_bound(postingListId); // iterate over all the vectors in the posting list for (int i = 0; i < p_postingListSize; ++i) @@ -553,7 +551,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); LOG(Helper::LogLevel::LL_Info, "EnableDataCompression: %s, ZstdCompressLevel: %d\n", p_opt.m_enableDataCompression ? "true" : "false", p_opt.m_zstdCompressLevel); std::vector postingListBytes(headVectorIDS.size()); - std::unique_ptr m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); + auto m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); if (p_opt.m_enableDataCompression) { LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); @@ -564,7 +562,11 @@ namespace SPTAG postingListBytes[i] = 0; continue; } - ValueType* headVector = (ValueType*)p_headIndex->GetSample(i); + ValueType* headVector = nullptr; + if (p_opt.m_enableDeltaEncoding) + { + headVector = (ValueType*)p_headIndex->GetSample(i); + } std::string postingListFullData = GetPostingListFullData( i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, headVector); size_t sizeToCompress = postingListSize[i] * vectorInfoSize; @@ -572,7 +574,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Size to compress NOT MATCH! PostingListFullData size: %zu sizeToCompress: %zu \n", postingListFullData.size(), sizeToCompress); } postingListBytes[i] = m_pCompressor->GetCompressedSize(postingListFullData); - if (i % 10000 == 0) { + if (i % 10000 == 0 || postingListBytes[i] > p_opt.m_postingPageLimit * PageSize) { LOG(Helper::LogLevel::LL_Info, "Posting list %d/%d, compressed size: %d, compression ratio: %.4f\n", i, postingListSize.size(), postingListBytes[i], postingListBytes[i] / float(sizeToCompress)); } } @@ -813,8 +815,7 @@ namespace SPTAG } listInfo.id = static_cast(i); - size_t postingListByte = p_postingListBytes[i]; - listInfo.rest = static_cast(postingListByte % PageSize); + listInfo.rest = static_cast(p_postingListBytes[i] % PageSize); listRestSize.insert(listInfo); } @@ -828,7 +829,7 @@ namespace SPTAG { listInfo.rest = PageSize - currOffset; auto iter = listRestSize.lower_bound(listInfo); // avoid page-crossing - if (iter == listRestSize.end()) + if (iter == listRestSize.end() || (listInfo.rest != PageSize && iter->rest == 0)) { ++currPageNum; currOffset = 0; @@ -862,10 +863,9 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "TotalPageNumbers: %d, IndexSize: %llu\n", currPageNum, static_cast(currPageNum) * PageSize + currOffset); } - void OutputSSDIndexFile(const std::string& p_outputFile, - bool p_enableDeltaEncoding, - bool p_enableDataCompression, + bool m_enableDeltaEncoding, + bool m_enableDataCompression, const std::unique_ptr& m_pCompressor, size_t p_spacePerVector, const std::vector& p_postingListSizes, @@ -1038,19 +1038,23 @@ namespace SPTAG } if (p_postingListSizes[id]==0) { - continue; + continue; } int postingListId = id + (int)p_postingListOffset; // get posting list full content and write it at once - ValueType* headVector = (ValueType*)p_headIndex->GetSample(postingListId); + ValueType* headVector = nullptr; + if (m_enableDeltaEncoding) + { + headVector = (ValueType*)p_headIndex->GetSample(postingListId); + } std::string postingListFullData = GetPostingListFullData( - postingListId, p_postingListSizes[id], p_postingSelections, p_fullVectors, p_enableDeltaEncoding, headVector); + postingListId, p_postingListSizes[id], p_postingSelections, p_fullVectors, m_enableDeltaEncoding, headVector); size_t postingListFullSize = p_postingListSizes[id] * p_spacePerVector; if (postingListFullSize != postingListFullData.size()) { LOG(Helper::LogLevel::LL_Error, "posting list full data size NOT MATCH! postingListFullData.size(): %zu postingListFullSize: %zu \n", postingListFullData.size(), postingListFullSize); exit(1); } - if (p_enableDataCompression) { + if (m_enableDataCompression) { std::string compressedData = m_pCompressor->Compress(postingListFullData); size_t compressedSize = compressedData.size(); if (compressedSize != p_postingListBytes[id]) From 0f33ca37d94197dc4fcb88023c0f1db682c810cc Mon Sep 17 00:00:00 2001 From: Guoxin Date: Mon, 30 May 2022 10:21:02 +0800 Subject: [PATCH 06/25] train & share dictionary (#6) --- AnnService/inc/Core/SPANN/Compressor.h | 146 ++++++++++++++++-- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 125 +++++++++++++-- AnnService/inc/Core/SPANN/IExtraSearcher.h | 2 - AnnService/inc/Core/SPANN/Options.h | 1 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + AnnService/src/Core/SPANN/SPANNIndex.cpp | 4 +- 6 files changed, 247 insertions(+), 32 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index d15aef57..133868bc 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -6,27 +6,127 @@ #include #include "zstd.h" +#include "zdict.h" #include "../Common.h" -namespace SPTAG { - namespace SPANN { +namespace SPTAG +{ + namespace SPANN + { class Compressor { public: - Compressor(int level=0) + Compressor(int level = 0, int bufferCapacity = 102400) { compress_level = level; + dictBufferCapacity = bufferCapacity; } - virtual ~Compressor(){} - - std::string Compress(const std::string &src) + virtual ~Compressor() {} + + void TrainDict(std::string samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) + { + dictBuffer.resize(dictBufferCapacity); + size_t dictSize = ZDICT_trainFromBuffer((void *)dictBuffer.data(), dictBufferCapacity, (void *)samplesBuffer.data(), &samplesSizes[0], nbSamples); + if (ZDICT_isError(dictSize)) + { + LOG(Helper::LogLevel::LL_Error, "ZDICT_trainFromBuffer() failed: %s \n", ZDICT_getErrorName(dictSize)); + exit(1); + } + dictBuffer.resize(dictSize); + dictBuffer.shrink_to_fit(); + } + + std::string GetDictBuffer() + { + return dictBuffer; + } + + void SetDictBuffer(std::string buffer) + { + dictBuffer = buffer; + } + + void CreateCDict() + { + cdict = ZSTD_createCDict((void *)dictBuffer.data(), dictBuffer.size(), compress_level); + if (cdict == NULL) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD_createCDict() failed! \n"); + exit(1); + } + } + + void CreateDDict() + { + ddict = ZSTD_createDDict((void *)dictBuffer.data(), dictBuffer.size()); + if (ddict == NULL) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD_createDDict() failed! \n"); + exit(1); + } + } + + std::string CompressWithDict(const std::string &src) + { + size_t est_compress_size = ZSTD_compressBound(src.size()); + std::string comp_buffer{}; + comp_buffer.resize(est_compress_size); + + ZSTD_CCtx *const cctx = ZSTD_createCCtx(); + if (cctx == NULL) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD_createCCtx() failed! \n"); + exit(1); + } + size_t compressed_size = ZSTD_compress_usingCDict(cctx, (void *)comp_buffer.data(), est_compress_size, src.data(), src.size(), cdict); + if (ZSTD_isError(compressed_size)) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD compress error %s, \n", ZSTD_getErrorName(compressed_size)); + exit(1); + } + ZSTD_freeCCtx(cctx); + comp_buffer.resize(compressed_size); + comp_buffer.shrink_to_fit(); + + return comp_buffer; + } + + std::string DecompressWithDict(const char *src, size_t srcSize) + { + auto const est_decomp_size = + ZSTD_getFrameContentSize(src, srcSize); + + std::string decomp_buffer{}; + decomp_buffer.resize(est_decomp_size); + + ZSTD_DCtx *const dctx = ZSTD_createDCtx(); + if (dctx == NULL) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD_createDCtx() failed! \n"); + exit(1); + } + size_t const decomp_size = ZSTD_decompress_usingDDict(dctx, + (void *)decomp_buffer.data(), est_decomp_size, src, srcSize, ddict); + if (ZSTD_isError(decomp_size)) + { + LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); + exit(1); + } + + ZSTD_freeDCtx(dctx); + decomp_buffer.resize(decomp_size); + decomp_buffer.shrink_to_fit(); + return decomp_buffer; + } + + std::string CompressWithoutDict(const std::string &src) { size_t est_comp_size = ZSTD_compressBound(src.size()); std::string buffer{}; buffer.resize(est_comp_size); - size_t compressed_size = ZSTD_compress((void*)buffer.data(), est_comp_size, - src.data(), src.size(), compress_level); + size_t compressed_size = ZSTD_compress((void *)buffer.data(), est_comp_size, + src.data(), src.size(), compress_level); if (ZSTD_isError(compressed_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD compress error %s, \n", ZSTD_getErrorName(compressed_size)); @@ -38,14 +138,16 @@ namespace SPTAG { return buffer; } - std::string Decompress(const char * src, size_t srcSize) + std::string DecompressWithoutDict(const char *src, size_t srcSize) { size_t est_decomp_size = ZSTD_getFrameContentSize(src, srcSize); - if (est_decomp_size == ZSTD_CONTENTSIZE_ERROR) { + if (est_decomp_size == ZSTD_CONTENTSIZE_ERROR) + { LOG(Helper::LogLevel::LL_Error, "not compressed by zstd!\n"); exit(1); } - else if (est_decomp_size == ZSTD_CONTENTSIZE_UNKNOWN) { + else if (est_decomp_size == ZSTD_CONTENTSIZE_UNKNOWN) + { LOG(Helper::LogLevel::LL_Error, "original size unknown!\n"); exit(1); } @@ -53,7 +155,7 @@ namespace SPTAG { est_decomp_size *= 10; dst.resize(est_decomp_size); size_t const decomp_size = ZSTD_decompress( - (void*)dst.data(), est_decomp_size, src, srcSize); + (void *)dst.data(), est_decomp_size, src, srcSize); if (ZSTD_isError(decomp_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); @@ -65,16 +167,30 @@ namespace SPTAG { return dst; } + std::string Compress(const std::string &src, const bool useDict) + { + return useDict ? CompressWithDict(src) : CompressWithoutDict(src); + } + + std::string Decompress(const char *src, size_t srcSize, const bool useDict) + { + return useDict ? DecompressWithDict(src, srcSize) : DecompressWithoutDict(src, srcSize); + } + // return the compressed sie - size_t GetCompressedSize(const std::string &src) + size_t GetCompressedSize(const std::string &src, bool useDict) { - std::string dst = Compress(src); + std::string dst = Compress(src, useDict); return dst.size(); } private: int compress_level; - std::string dictionary{}; + + std::string dictBuffer; + size_t dictBufferCapacity; + ZSTD_CDict *cdict; + ZSTD_DDict *ddict; }; } // SPANN } // SPTAG diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 2720627c..70c61dd9 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -139,8 +139,6 @@ namespace SPTAG QueryResult& p_queryResults, std::shared_ptr p_index, SearchStats* p_stats, - bool m_enableDeltaEncoding, - bool m_enableDataCompression, std::set* truth, std::map>* found) { const uint32_t postingListCount = static_cast(p_exWorkSpace->m_postingIDs.size()); @@ -156,7 +154,7 @@ namespace SPTAG #endif bool oneContext = (m_indexFiles.size() == 1); - auto m_pCompressor = std::make_unique(); // no need compress level to decompress + for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; @@ -193,7 +191,7 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, m_enableDeltaEncoding, curPostingID, m_enableDataCompression, &m_pCompressor](Helper::AsyncReadRequest* request) + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding=m_enableDeltaEncoding, m_enableDictTraining= m_enableDictTraining, m_enableDataCompression= m_enableDataCompression, &m_pCompressor=m_pCompressor](Helper::AsyncReadRequest* request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); @@ -205,7 +203,7 @@ namespace SPTAG { if (listInfo->listEleCount != 0) { - postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); } if (postingListFullData.size() != listInfo->listEleCount * vectorInfoSize) { @@ -280,7 +278,7 @@ namespace SPTAG char* p_postingListFullData; if (m_enableDataCompression) { - std::string postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes); + std::string postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); p_postingListFullData = const_cast(postingListFullData.c_str()); } else @@ -551,10 +549,38 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); LOG(Helper::LogLevel::LL_Info, "EnableDataCompression: %s, ZstdCompressLevel: %d\n", p_opt.m_enableDataCompression ? "true" : "false", p_opt.m_zstdCompressLevel); std::vector postingListBytes(headVectorIDS.size()); - auto m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); + m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); if (p_opt.m_enableDataCompression) { LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); + + LOG(Helper::LogLevel::LL_Info, "Training dictionary...\n"); + unsigned nbSamples = 0; + std::string samplesBuffer(""); + std::vector samplesSizes; + for (int i = 0; i < postingListSize.size(); i++) { + // do not compress if no data + if (postingListSize[i] == 0) { + postingListBytes[i] = 0; + continue; + } + ValueType* headVector = nullptr; + if (p_opt.m_enableDeltaEncoding) + { + headVector = (ValueType*)p_headIndex->GetSample(i); + } + std::string postingListFullData = GetPostingListFullData( + i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, headVector); + + samplesBuffer += postingListFullData; + samplesSizes.push_back(postingListFullData.size()); + if (samplesBuffer.size() > 102400) break; + } + LOG(Helper::LogLevel::LL_Info, "Using the first %zu postingLists to train dictionary... \n", samplesSizes.size()); + m_pCompressor->TrainDict(samplesBuffer, &samplesSizes[0], samplesSizes.size()); + m_pCompressor->CreateCDict(); + LOG(Helper::LogLevel::LL_Info, "Dictionary trained.\n"); + // TODO: omp parallel for (int i = 0; i < postingListSize.size(); i++) { // do not compress if no data @@ -573,8 +599,8 @@ namespace SPTAG if (sizeToCompress != postingListFullData.size()) { LOG(Helper::LogLevel::LL_Error, "Size to compress NOT MATCH! PostingListFullData size: %zu sizeToCompress: %zu \n", postingListFullData.size(), sizeToCompress); } - postingListBytes[i] = m_pCompressor->GetCompressedSize(postingListFullData); - if (i % 10000 == 0 || postingListBytes[i] > p_opt.m_postingPageLimit * PageSize) { + postingListBytes[i] = m_pCompressor->GetCompressedSize(postingListFullData, true); + if (i % 10000 == 0 || postingListBytes[i] > static_cast(p_opt.m_postingPageLimit) * PageSize) { LOG(Helper::LogLevel::LL_Info, "Posting list %d/%d, compressed size: %d, compression ratio: %.4f\n", i, postingListSize.size(), postingListBytes[i], postingListBytes[i] / float(sizeToCompress)); } } @@ -615,7 +641,7 @@ namespace SPTAG OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), p_opt.m_enableDeltaEncoding, p_opt.m_enableDataCompression, - m_pCompressor, + p_opt.m_enableDictTraining, vectorInfoSize, curPostingListSizes, curPostingListBytes, @@ -672,6 +698,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed to open file: %s\n", p_file.c_str()); exit(1); } + m_pCompressor = std::make_unique(); // no need compress level to decompress int m_listCount; int m_totalDocumentCount; @@ -755,6 +782,37 @@ namespace SPTAG } } + if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + + if (m_enableDataCompression && m_enableDictTraining) + { + size_t dictBufferSize; + + if (ptr->ReadBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + char* dictBuffer = new char[dictBufferSize]; + if (ptr->ReadBinary(dictBufferSize, dictBuffer) != dictBufferSize) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); + delete[] dictBuffer; + m_pCompressor->CreateDDict(); + } + LOG(Helper::LogLevel::LL_Info, "Finish reading header info, list count %d, total doc count %d, dimension %d, list page offset %d.\n", m_listCount, @@ -866,7 +924,7 @@ namespace SPTAG void OutputSSDIndexFile(const std::string& p_outputFile, bool m_enableDeltaEncoding, bool m_enableDataCompression, - const std::unique_ptr& m_pCompressor, + bool m_enableDictTraining, size_t p_spacePerVector, const std::vector& p_postingListSizes, const std::vector& p_postingListBytes, @@ -899,7 +957,15 @@ namespace SPTAG std::uint64_t listOffset = sizeof(int) * 4; // meta size of the posting lists listOffset += (sizeof(size_t) + sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); - + + listOffset += sizeof(bool) * 3; + // compression dict + if (m_enableDataCompression && m_enableDictTraining) + { + listOffset += sizeof(size_t); + listOffset += m_pCompressor->GetDictBuffer().size(); + } + std::unique_ptr paddingVals(new char[PageSize]); memset(paddingVals.get(), 0, sizeof(char) * PageSize); // paddingSize: bytes left in the last page @@ -988,6 +1054,35 @@ namespace SPTAG exit(1); } } + // m_enableDeltaEncoding, + if (ptr->WriteBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + if (ptr->WriteBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + if (ptr->WriteBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + // compression dict + if (m_enableDataCompression && m_enableDictTraining) + { + std::string dictBuffer = m_pCompressor->GetDictBuffer(); + // dict size + size_t dictBufferSize = dictBuffer.size(); + if (ptr->WriteBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + // dict + if (ptr->WriteBinary(dictBuffer.size(), const_cast(dictBuffer.data())) != dictBuffer.size()) { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + } // Write padding vals if (paddingSize > 0) @@ -1055,7 +1150,7 @@ namespace SPTAG exit(1); } if (m_enableDataCompression) { - std::string compressedData = m_pCompressor->Compress(postingListFullData); + std::string compressedData = m_pCompressor->Compress(postingListFullData, m_enableDictTraining); size_t compressedSize = compressedData.size(); if (compressedSize != p_postingListBytes[id]) { @@ -1111,6 +1206,10 @@ namespace SPTAG std::vector> m_listInfos; std::vector> m_indexFiles; + std::unique_ptr m_pCompressor; + bool m_enableDeltaEncoding; + bool m_enableDataCompression; + bool m_enableDictTraining; int m_vectorInfoSize = 0; diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index e8578542..7def4813 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -166,8 +166,6 @@ namespace SPTAG { QueryResult& p_queryResults, std::shared_ptr p_index, SearchStats* p_stats, - bool m_enableDeltaEncoding = false, - bool m_enableDataCompression = false, std::set* truth = nullptr, std::map>* found = nullptr) = 0; diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index cfbc0e1d..4306dbc4 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -81,6 +81,7 @@ namespace SPTAG { int m_iSSDNumberOfThreads; bool m_enableDeltaEncoding; bool m_enableDataCompression; + bool m_enableDictTraining; int m_zstdCompressLevel; // Building diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 14621582..2904a19f 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -79,6 +79,7 @@ DefineSSDParameter(m_buildSsdIndex, bool, false, "BuildSsdIndex") DefineSSDParameter(m_iSSDNumberOfThreads, int, 16, "NumberOfThreads") DefineSSDParameter(m_enableDeltaEncoding, bool, false, "EnableDeltaEncoding") DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") +DefineSSDParameter(m_enableDictTraining, bool, true, "EnableDictTraining") DefineSSDParameter(m_zstdCompressLevel, int, 0, "ZstdCompressLevel") // Building diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index c592de54..fe8c526a 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -229,7 +229,7 @@ namespace SPTAG } p_queryResults->Reverse(); - m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr, m_options.m_enableDeltaEncoding, m_options.m_enableDataCompression); + m_extraSearcher->SearchIndex(workSpace.get(), *p_queryResults, m_index, nullptr); p_queryResults->SortResult(); m_workSpacePool->Return(workSpace); } @@ -286,7 +286,7 @@ namespace SPTAG auto_ws->m_postingIDs.emplace_back(res->VID); } - m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, m_options.m_enableDeltaEncoding, m_options.m_enableDataCompression, truth, found); + m_extraSearcher->SearchIndex(auto_ws.get(), newResults, m_index, p_stats, truth, found); } m_workSpacePool->Return(auto_ws); From 17a2150a554aa481008bca980df22daaf4545a9e Mon Sep 17 00:00:00 2001 From: Guoxin Date: Mon, 30 May 2022 21:09:05 +0800 Subject: [PATCH 07/25] rearrange posting list (#7) --- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 138 ++++++++++++------ AnnService/inc/Core/SPANN/Options.h | 1 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + 3 files changed, 96 insertions(+), 44 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 70c61dd9..5c057465 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -81,11 +81,13 @@ namespace SPTAG } }; -#define ProcessPosting(p_postingListFullData, vectorInfoSize) \ - for (char *vectorInfo = p_postingListFullData, *vectorInfoEnd = vectorInfo + listInfo->listEleCount * vectorInfoSize; vectorInfo < vectorInfoEnd; vectorInfo += vectorInfoSize) { \ - int vectorID = *(reinterpret_cast(vectorInfo)); \ +#define ProcessPosting(p_postingListFullData, vectorInfoSize, m_enablePostingListRearrange) \ + for (int i = 0; i < listInfo->listEleCount; i++) { \ + uint64_t offsetVectorID = m_enablePostingListRearrange ? (vectorInfoSize - sizeof(int)) * listInfo->listEleCount + sizeof(int) * i : vectorInfoSize * i; \ + int vectorID = *(reinterpret_cast(p_postingListFullData + offsetVectorID));\ if (p_exWorkSpace->m_deduper.CheckAndSet(vectorID)) continue; \ - auto distance2leaf = p_index->ComputeDistance(queryResults.GetQuantizedTarget(), vectorInfo + sizeof(int)); \ + uint64_t offsetVector = m_enablePostingListRearrange ? (vectorInfoSize - sizeof(int)) * i : vectorInfoSize * i + sizeof(int); \ + auto distance2leaf = p_index->ComputeDistance(queryResults.GetQuantizedTarget(), p_postingListFullData + offsetVector); \ queryResults.AddPoint(vectorID, distance2leaf); \ } \ @@ -191,7 +193,7 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding=m_enableDeltaEncoding, m_enableDictTraining= m_enableDictTraining, m_enableDataCompression= m_enableDataCompression, &m_pCompressor=m_pCompressor](Helper::AsyncReadRequest* request) + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding = m_enableDeltaEncoding, m_enablePostingListRearrange = m_enablePostingListRearrange, m_enableDictTraining = m_enableDictTraining, m_enableDataCompression = m_enableDataCompression, &m_pCompressor = m_pCompressor](Helper::AsyncReadRequest *request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); @@ -214,11 +216,12 @@ namespace SPTAG } // delta encoding - if (m_enableDeltaEncoding) { - ValueType* headVector = (ValueType*)p_index->GetSample(curPostingID); - for (char* vectorInfo = p_postingListFullData, *vectorInfoEnd = vectorInfo + listInfo->listEleCount * vectorInfoSize; vectorInfo < vectorInfoEnd; vectorInfo += vectorInfoSize) + if (m_enableDeltaEncoding) + { + ValueType *headVector = (ValueType *)p_index->GetSample(curPostingID); + for (int i = 0; i < listInfo->listEleCount; i++) { - ValueType* leaf = reinterpret_cast(vectorInfo + sizeof(int)); + ValueType *leaf = m_enablePostingListRearrange ? reinterpret_cast(p_postingListFullData + (vectorInfoSize - sizeof(int)) * i) : reinterpret_cast(p_postingListFullData + vectorInfoSize * i + sizeof(int)); for (auto i = 0; i < p_index->GetFeatureDim(); i++) { leaf[i] += headVector[i]; @@ -226,7 +229,7 @@ namespace SPTAG } } - ProcessPosting(const_cast(p_postingListFullData), vectorInfoSize); + ProcessPosting(const_cast(p_postingListFullData), vectorInfoSize, m_enablePostingListRearrange); }; #else // async read request.m_callback = [&p_exWorkSpace](Helper::AsyncReadRequest* request) @@ -308,12 +311,15 @@ namespace SPTAG std::string GetPostingListFullData( int postingListId, size_t p_postingListSize, - Selection& p_selections, + Selection &p_selections, std::shared_ptr p_fullVectors, - bool m_enableDeltaEncoding=false, - const ValueType *headVector=nullptr) + bool m_enableDeltaEncoding = false, + bool m_enablePostingListRearrange = false, + const ValueType *headVector = nullptr) { std::string postingListFullData(""); + std::string vectors(""); + std::string vectorIDs(""); size_t selectIdx = p_selections.lower_bound(postingListId); // iterate over all the vectors in the posting list for (int i = 0; i < p_postingListSize; ++i) @@ -323,24 +329,41 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Selection ID NOT MATCH! node:%d offset:%zu\n", postingListId, selectIdx); exit(1); } + std::string vectorID(""); + std::string vector(""); + int vid = p_selections[selectIdx++].tonode; - postingListFullData.append(reinterpret_cast(&vid), sizeof(int)); - ValueType* p_vector = reinterpret_cast(p_fullVectors->GetVector(vid)); + vectorID.append(reinterpret_cast(&vid), sizeof(int)); - if (m_enableDeltaEncoding) { + ValueType *p_vector = reinterpret_cast(p_fullVectors->GetVector(vid)); + if (m_enableDeltaEncoding) + { DimensionType n = p_fullVectors->Dimension(); std::vector p_vector_delta(n); for (auto j = 0; j < n; j++) { p_vector_delta[j] = p_vector[j] - headVector[j]; } - postingListFullData.append(reinterpret_cast(&p_vector_delta[0]), p_fullVectors->PerVectorDataSize()); + vector.append(reinterpret_cast(&p_vector_delta[0]), p_fullVectors->PerVectorDataSize()); + } + else + { + vector.append(reinterpret_cast(p_vector), p_fullVectors->PerVectorDataSize()); + } + + if (m_enablePostingListRearrange) + { + vectorIDs += vectorID; + vectors += vector; } else { - postingListFullData.append(reinterpret_cast(p_vector), p_fullVectors->PerVectorDataSize()); + postingListFullData += (vectorID + vector); } - + } + if (m_enablePostingListRearrange) + { + return vectors + vectorIDs; } return postingListFullData; } @@ -570,7 +593,7 @@ namespace SPTAG headVector = (ValueType*)p_headIndex->GetSample(i); } std::string postingListFullData = GetPostingListFullData( - i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, headVector); + i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, p_opt.m_enablePostingListRearrange, headVector); samplesBuffer += postingListFullData; samplesSizes.push_back(postingListFullData.size()); @@ -594,7 +617,7 @@ namespace SPTAG headVector = (ValueType*)p_headIndex->GetSample(i); } std::string postingListFullData = GetPostingListFullData( - i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, headVector); + i, postingListSize[i], selections, fullVectors, p_opt.m_enableDeltaEncoding, p_opt.m_enablePostingListRearrange, headVector); size_t sizeToCompress = postingListSize[i] * vectorInfoSize; if (sizeToCompress != postingListFullData.size()) { LOG(Helper::LogLevel::LL_Error, "Size to compress NOT MATCH! PostingListFullData size: %zu sizeToCompress: %zu \n", postingListFullData.size(), sizeToCompress); @@ -640,6 +663,7 @@ namespace SPTAG // write one file OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), p_opt.m_enableDeltaEncoding, + p_opt.m_enablePostingListRearrange, p_opt.m_enableDataCompression, p_opt.m_enableDictTraining, vectorInfoSize, @@ -782,15 +806,23 @@ namespace SPTAG } } - if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) { + if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) + { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + if (ptr->ReadBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) + { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) { + if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) + { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) { + if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) + { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } @@ -923,6 +955,7 @@ namespace SPTAG void OutputSSDIndexFile(const std::string& p_outputFile, bool m_enableDeltaEncoding, + bool m_enablePostingListRearrange, bool m_enableDataCompression, bool m_enableDictTraining, size_t p_spacePerVector, @@ -957,15 +990,15 @@ namespace SPTAG std::uint64_t listOffset = sizeof(int) * 4; // meta size of the posting lists listOffset += (sizeof(size_t) + sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); - - listOffset += sizeof(bool) * 3; + + listOffset += sizeof(bool) * 4; // compression dict if (m_enableDataCompression && m_enableDictTraining) { listOffset += sizeof(size_t); listOffset += m_pCompressor->GetDictBuffer().size(); } - + std::unique_ptr paddingVals(new char[PageSize]); memset(paddingVals.get(), 0, sizeof(char) * PageSize); // paddingSize: bytes left in the last page @@ -1055,36 +1088,46 @@ namespace SPTAG } } // m_enableDeltaEncoding, - if (ptr->WriteBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) { + if (ptr->WriteBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) + { + LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); + exit(1); + } + if (ptr->WriteBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - if (ptr->WriteBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) { + if (ptr->WriteBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - if (ptr->WriteBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) { + if (ptr->WriteBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } - // compression dict + // compression dict if (m_enableDataCompression && m_enableDictTraining) { std::string dictBuffer = m_pCompressor->GetDictBuffer(); // dict size size_t dictBufferSize = dictBuffer.size(); - if (ptr->WriteBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { + if (ptr->WriteBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } // dict - if (ptr->WriteBinary(dictBuffer.size(), const_cast(dictBuffer.data())) != dictBuffer.size()) { + if (ptr->WriteBinary(dictBuffer.size(), const_cast(dictBuffer.data())) != dictBuffer.size()) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } } - // Write padding vals + // Write padding vals if (paddingSize > 0) { if (ptr->WriteBinary(paddingSize, reinterpret_cast(paddingVals.get())) != paddingSize) { @@ -1132,32 +1175,36 @@ namespace SPTAG listOffset = targetOffset; } - if (p_postingListSizes[id]==0) { + if (p_postingListSizes[id] == 0) + { continue; } int postingListId = id + (int)p_postingListOffset; // get posting list full content and write it at once - ValueType* headVector = nullptr; + ValueType *headVector = nullptr; if (m_enableDeltaEncoding) { - headVector = (ValueType*)p_headIndex->GetSample(postingListId); + headVector = (ValueType *)p_headIndex->GetSample(postingListId); } std::string postingListFullData = GetPostingListFullData( - postingListId, p_postingListSizes[id], p_postingSelections, p_fullVectors, m_enableDeltaEncoding, headVector); + postingListId, p_postingListSizes[id], p_postingSelections, p_fullVectors, m_enableDeltaEncoding, m_enablePostingListRearrange, headVector); size_t postingListFullSize = p_postingListSizes[id] * p_spacePerVector; - if (postingListFullSize != postingListFullData.size()) { + if (postingListFullSize != postingListFullData.size()) + { LOG(Helper::LogLevel::LL_Error, "posting list full data size NOT MATCH! postingListFullData.size(): %zu postingListFullSize: %zu \n", postingListFullData.size(), postingListFullSize); exit(1); } - if (m_enableDataCompression) { - std::string compressedData = m_pCompressor->Compress(postingListFullData, m_enableDictTraining); + if (m_enableDataCompression) + { + std::string compressedData = m_pCompressor->Compress(postingListFullData, m_enableDictTraining); size_t compressedSize = compressedData.size(); if (compressedSize != p_postingListBytes[id]) { LOG(Helper::LogLevel::LL_Error, "Compressed size NOT MATCH! compressed size:%zu, pre-calculated compressed size:%zu\n", compressedSize, p_postingListBytes[id]); exit(1); } - if (ptr->WriteBinary(compressedSize, compressedData.data()) != compressedSize) { + if (ptr->WriteBinary(compressedSize, compressedData.data()) != compressedSize) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } @@ -1165,7 +1212,8 @@ namespace SPTAG } else { - if (ptr->WriteBinary(postingListFullSize, postingListFullData.data()) != postingListFullSize) { + if (ptr->WriteBinary(postingListFullSize, postingListFullData.data()) != postingListFullSize) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } @@ -1186,7 +1234,8 @@ namespace SPTAG if (paddingSize > 0) { - if (ptr->WriteBinary(paddingSize, reinterpret_cast(paddingVals.get())) != paddingSize) { + if (ptr->WriteBinary(paddingSize, reinterpret_cast(paddingVals.get())) != paddingSize) + { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } @@ -1208,6 +1257,7 @@ namespace SPTAG std::vector> m_indexFiles; std::unique_ptr m_pCompressor; bool m_enableDeltaEncoding; + bool m_enablePostingListRearrange; bool m_enableDataCompression; bool m_enableDictTraining; diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 4306dbc4..41961833 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -80,6 +80,7 @@ namespace SPTAG { bool m_buildSsdIndex; int m_iSSDNumberOfThreads; bool m_enableDeltaEncoding; + bool m_enablePostingListRearrange; bool m_enableDataCompression; bool m_enableDictTraining; int m_zstdCompressLevel; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 2904a19f..ab93db8b 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -78,6 +78,7 @@ DefineSSDParameter(m_enableSSD, bool, false, "isExecute") DefineSSDParameter(m_buildSsdIndex, bool, false, "BuildSsdIndex") DefineSSDParameter(m_iSSDNumberOfThreads, int, 16, "NumberOfThreads") DefineSSDParameter(m_enableDeltaEncoding, bool, false, "EnableDeltaEncoding") +DefineSSDParameter(m_enablePostingListRearrange, bool, false, "EnablePostingListRearrange") DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") DefineSSDParameter(m_enableDictTraining, bool, true, "EnableDictTraining") DefineSSDParameter(m_zstdCompressLevel, int, 0, "ZstdCompressLevel") From f502072cd8fc05a689db3667c2d39dc17d04570b Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 31 May 2022 08:01:43 +0800 Subject: [PATCH 08/25] config minDictTraingBufferSize and dictBufferCapacity (#8) --- AnnService/inc/Core/SPANN/Compressor.h | 71 ++++++++++--------- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 12 ++-- AnnService/inc/Core/SPANN/Options.h | 2 + .../inc/Core/SPANN/ParameterDefinitionList.h | 2 + 4 files changed, 46 insertions(+), 41 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 133868bc..07cbc24c 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -15,38 +15,7 @@ namespace SPTAG { class Compressor { - public: - Compressor(int level = 0, int bufferCapacity = 102400) - { - compress_level = level; - dictBufferCapacity = bufferCapacity; - } - - virtual ~Compressor() {} - - void TrainDict(std::string samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) - { - dictBuffer.resize(dictBufferCapacity); - size_t dictSize = ZDICT_trainFromBuffer((void *)dictBuffer.data(), dictBufferCapacity, (void *)samplesBuffer.data(), &samplesSizes[0], nbSamples); - if (ZDICT_isError(dictSize)) - { - LOG(Helper::LogLevel::LL_Error, "ZDICT_trainFromBuffer() failed: %s \n", ZDICT_getErrorName(dictSize)); - exit(1); - } - dictBuffer.resize(dictSize); - dictBuffer.shrink_to_fit(); - } - - std::string GetDictBuffer() - { - return dictBuffer; - } - - void SetDictBuffer(std::string buffer) - { - dictBuffer = buffer; - } - + private: void CreateCDict() { cdict = ZSTD_createCDict((void *)dictBuffer.data(), dictBuffer.size(), compress_level); @@ -152,7 +121,6 @@ namespace SPTAG exit(1); } std::string dst{}; - est_decomp_size *= 10; dst.resize(est_decomp_size); size_t const decomp_size = ZSTD_decompress( (void *)dst.data(), est_decomp_size, src, srcSize); @@ -167,6 +135,43 @@ namespace SPTAG return dst; } + public: + Compressor(int level = 0, int bufferCapacity = 102400) + { + compress_level = level; + dictBufferCapacity = bufferCapacity; + } + + virtual ~Compressor() {} + + std::size_t TrainDict(std::string samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) + { + dictBuffer.resize(dictBufferCapacity); + size_t dictSize = ZDICT_trainFromBuffer((void *)dictBuffer.data(), dictBufferCapacity, (void *)samplesBuffer.data(), &samplesSizes[0], nbSamples); + if (ZDICT_isError(dictSize)) + { + LOG(Helper::LogLevel::LL_Error, "ZDICT_trainFromBuffer() failed: %s \n", ZDICT_getErrorName(dictSize)); + exit(1); + } + dictBuffer.resize(dictSize); + dictBuffer.shrink_to_fit(); + + CreateCDict(); + + return dictSize; + } + + std::string GetDictBuffer() + { + return dictBuffer; + } + + void SetDictBuffer(std::string buffer) + { + dictBuffer = buffer; + CreateDDict(); + } + std::string Compress(const std::string &src, const bool useDict) { return useDict ? CompressWithDict(src) : CompressWithoutDict(src); diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 5c057465..36f9e80e 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -572,7 +572,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); LOG(Helper::LogLevel::LL_Info, "EnableDataCompression: %s, ZstdCompressLevel: %d\n", p_opt.m_enableDataCompression ? "true" : "false", p_opt.m_zstdCompressLevel); std::vector postingListBytes(headVectorIDS.size()); - m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel); + m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel, p_opt.m_dictBufferCapacity); if (p_opt.m_enableDataCompression) { LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); @@ -582,9 +582,7 @@ namespace SPTAG std::string samplesBuffer(""); std::vector samplesSizes; for (int i = 0; i < postingListSize.size(); i++) { - // do not compress if no data if (postingListSize[i] == 0) { - postingListBytes[i] = 0; continue; } ValueType* headVector = nullptr; @@ -597,12 +595,11 @@ namespace SPTAG samplesBuffer += postingListFullData; samplesSizes.push_back(postingListFullData.size()); - if (samplesBuffer.size() > 102400) break; + if (samplesBuffer.size() > p_opt.m_minDictTraingBufferSize) break; } LOG(Helper::LogLevel::LL_Info, "Using the first %zu postingLists to train dictionary... \n", samplesSizes.size()); - m_pCompressor->TrainDict(samplesBuffer, &samplesSizes[0], samplesSizes.size()); - m_pCompressor->CreateCDict(); - LOG(Helper::LogLevel::LL_Info, "Dictionary trained.\n"); + std::size_t dictSize = m_pCompressor->TrainDict(samplesBuffer, &samplesSizes[0], samplesSizes.size()); + LOG(Helper::LogLevel::LL_Info, "Dictionary trained, dictionary size: %zu \n", dictSize); // TODO: omp parallel for (int i = 0; i < postingListSize.size(); i++) { @@ -842,7 +839,6 @@ namespace SPTAG } m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); delete[] dictBuffer; - m_pCompressor->CreateDDict(); } LOG(Helper::LogLevel::LL_Info, diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 41961833..7c726c0a 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -83,6 +83,8 @@ namespace SPTAG { bool m_enablePostingListRearrange; bool m_enableDataCompression; bool m_enableDictTraining; + int m_minDictTraingBufferSize; + int m_dictBufferCapacity; int m_zstdCompressLevel; // Building diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index ab93db8b..7e3704d6 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -81,6 +81,8 @@ DefineSSDParameter(m_enableDeltaEncoding, bool, false, "EnableDeltaEncoding") DefineSSDParameter(m_enablePostingListRearrange, bool, false, "EnablePostingListRearrange") DefineSSDParameter(m_enableDataCompression, bool, false, "EnableDataCompression") DefineSSDParameter(m_enableDictTraining, bool, true, "EnableDictTraining") +DefineSSDParameter(m_minDictTraingBufferSize, int, 10240000, "MinDictTrainingBufferSize") +DefineSSDParameter(m_dictBufferCapacity, int, 204800, "DictBufferCapacity") DefineSSDParameter(m_zstdCompressLevel, int, 0, "ZstdCompressLevel") // Building From 223bdd3daa3115f3f686a61524f9adb5c997e9fd Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 31 May 2022 13:05:54 +0800 Subject: [PATCH 09/25] cmake with local installed zstd (#9) --- AnnService/CMakeLists.txt | 6 ++++-- CMakeLists.txt | 15 +++------------ Dockerfile | 3 --- Dockerfile.cuda | 1 + 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/AnnService/CMakeLists.txt b/AnnService/CMakeLists.txt index b8d6525e..470f7d65 100644 --- a/AnnService/CMakeLists.txt +++ b/AnnService/CMakeLists.txt @@ -2,8 +2,10 @@ # Licensed under the MIT License. set(AnnService ${PROJECT_SOURCE_DIR}/AnnService) +set(Zstd ${PROJECT_SOURCE_DIR}/ThirdParty/zstd) include_directories(${AnnService}) +include_directories(${Zstd}/lib) file(GLOB_RECURSE HDR_FILES ${AnnService}/inc/Core/*.h ${AnnService}/inc/Helper/*.h) file(GLOB_RECURSE SRC_FILES ${AnnService}/src/Core/*.cpp ${AnnService}/src/Helper/*.cpp) @@ -32,9 +34,9 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") endif() add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES}) -target_link_libraries (SPTAGLib DistanceUtils zstd::libzstd_shared) +target_link_libraries (SPTAGLib DistanceUtils libzstd_shared) add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES}) -target_link_libraries (SPTAGLibStatic DistanceUtils zstd::libzstd_static) +target_link_libraries (SPTAGLibStatic DistanceUtils libzstd_static) if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") target_compile_options(SPTAGLibStatic PRIVATE -fPIC) endif() diff --git a/CMakeLists.txt b/CMakeLists.txt index af3148a4..f1b97a45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,20 +90,11 @@ else() message (FATAL_ERROR "Could not find Boost >= 1.67!") endif() -find_package(zstd) -if (zstd_FOUND) - include_directories (${zstd_INCLUDE_DIR}) - link_directories (${zstd_LIBRARY_DIR}) - message (STATUS "Found zstd.") - message (STATUS "Include Path: ${zstd_INCLUDE_DIR}") - message (STATUS "Library Path: ${zstd_LIBRARY_DIR}") - message (STATUS "Library: ${zstd_LIBRARIES}") -else() - message (FATAL_ERROR "Could not find zstd") -endif() - option(GPU "GPU" ON) option(LIBRARYONLY "LIBRARYONLY" OFF) + +add_subdirectory (ThirdParty/zstd/build/cmake) + add_subdirectory (AnnService) add_subdirectory (Test) add_subdirectory (GPUSupport) diff --git a/Dockerfile b/Dockerfile index ef8be89c..f2a9f208 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,9 +14,6 @@ COPY AnnService ./AnnService/ COPY Test ./Test/ COPY Wrappers ./Wrappers/ COPY GPUSupport ./GPUSupport/ - -# install zstd COPY ThirdParty ./ThirdParty/ -RUN cd ThirdParty/zstd/build/cmake && rm -rf builddir && mkdir builddir && cd builddir && cmake .. && make -j$(nproc) && make install RUN mkdir build && cd build && cmake .. && make -j$(nproc) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index 6e984fdf..4d6bf5ee 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -14,5 +14,6 @@ COPY AnnService ./AnnService/ COPY Test ./Test/ COPY Wrappers ./Wrappers/ COPY GPUSupport ./GPUSupport/ +COPY ThirdParty ./ThirdParty/ RUN mkdir build && cd build && cmake .. && make -j && cd .. From e23174a7f1d5198820a70c3bfd293ad0ce625e64 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 31 May 2022 14:43:25 +0800 Subject: [PATCH 10/25] refine visual studio config (#10) --- AnnService/CoreLibrary.vcxproj | 12 ++++++------ AnnService/SSDServing.vcxproj | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index e562ee8e..b7328bbd 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -99,10 +99,10 @@ ProgramDatabase /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) stdcpp17 - C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include + $(SolutionDir)\vcpkg\installed\x64-windows\include - C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib + $(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib @@ -123,7 +123,7 @@ Level3 - Disabled + MaxSpeed true true true @@ -132,17 +132,17 @@ _MBCS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) Default - C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include + $(SolutionDir)\vcpkg\installed\x64-windows\include true true - C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib + $(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib - XCOPY C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y + XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index 29815d13..bfec997b 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -54,7 +54,7 @@ MultiByte - Application + StaticLibrary false v142 true @@ -152,7 +152,7 @@ true _$(OutputType);_MBCS;_SCL_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true - C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\include;%(AdditionalIncludeDirectories) + $(SolutionDir)\vcpkg\installed\x64-windows\include;%(AdditionalIncludeDirectories) NotUsing inc/SSDServing/Common/stdafx.h true @@ -166,10 +166,10 @@ true true %(AdditionalLibraryDirectories) - CoreLibrary.lib;C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\lib\zstd.lib;%(AdditionalDependencies) + CoreLibrary.lib;$(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib;%(AdditionalDependencies) - XCOPY C:\Users\guoxintest\source\repos\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y + XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y From d6e7f417aaea542b0ae4ccdc623b0b8104045d88 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 31 May 2022 17:18:49 +0800 Subject: [PATCH 11/25] parallel for get compressed size (#11) --- AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 36f9e80e..0ca80385 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -601,7 +601,7 @@ namespace SPTAG std::size_t dictSize = m_pCompressor->TrainDict(samplesBuffer, &samplesSizes[0], samplesSizes.size()); LOG(Helper::LogLevel::LL_Info, "Dictionary trained, dictionary size: %zu \n", dictSize); - // TODO: omp parallel +#pragma omp parallel for schedule(dynamic) for (int i = 0; i < postingListSize.size(); i++) { // do not compress if no data if (postingListSize[i] == 0) { From 76037b32801c6471d2e39e07d6dfcc9442580b38 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Wed, 1 Jun 2022 16:13:23 +0800 Subject: [PATCH 12/25] fix check truth bug (#12) --- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 0ca80385..02d3f3ef 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -278,21 +278,20 @@ namespace SPTAG ListInfo* listInfo = &(m_listInfos[curPostingID / m_listPerFile][curPostingID % m_listPerFile]); char* buffer = (char*)((p_exWorkSpace->m_pageBuffers[pi]).GetBuffer()); - char* p_postingListFullData; + char* p_postingListFullData = buffer + listInfo->pageOffset; + std::string postingListFullData(""); if (m_enableDataCompression) { - std::string postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); - p_postingListFullData = const_cast(postingListFullData.c_str()); - } - else - { - p_postingListFullData = buffer + listInfo->pageOffset; + if (listInfo->listEleCount != 0) + { + postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); + } + p_postingListFullData = &postingListFullData[0]; } for (size_t i = 0; i < listInfo->listEleCount; ++i) { - char* vectorInfo = p_postingListFullData + i * m_vectorInfoSize; - int vectorID = *(reinterpret_cast(vectorInfo)); - + uint64_t offsetVectorID = m_enablePostingListRearrange ? (m_vectorInfoSize - sizeof(int)) * listInfo->listEleCount + sizeof(int) * i : m_vectorInfoSize * i; \ + int vectorID = *(reinterpret_cast(p_postingListFullData + offsetVectorID)); \ LOG(Helper::LogLevel::LL_Info, "vectorID: %d\n", vectorID); if (truth && truth->count(vectorID)) (*found)[curPostingID].insert(vectorID); @@ -578,7 +577,6 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); LOG(Helper::LogLevel::LL_Info, "Training dictionary...\n"); - unsigned nbSamples = 0; std::string samplesBuffer(""); std::vector samplesSizes; for (int i = 0; i < postingListSize.size(); i++) { From 138b8da98a5eb304e8e7fd08174e75b9bd8d0c48 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Wed, 1 Jun 2022 16:57:55 +0800 Subject: [PATCH 13/25] change zstd branch (#13) --- .gitmodules | 1 + ThirdParty/zstd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index a84eabe0..a6fa563c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "ThirdParty/zstd"] path = ThirdParty/zstd url = https://github.com/facebook/zstd + branch = release diff --git a/ThirdParty/zstd b/ThirdParty/zstd index 9a5e73c7..e47e674c 160000 --- a/ThirdParty/zstd +++ b/ThirdParty/zstd @@ -1 +1 @@ -Subproject commit 9a5e73c74ef2d621992154306ab1ab6ba44ac8fa +Subproject commit e47e674cd09583ff0503f0f6defd6d23d8b718d3 From 161e7a8f17757c5553aa689c18f4200af1e4d61d Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 1 Jun 2022 12:23:18 +0000 Subject: [PATCH 14/25] remove verbose log in truth analysis; refine dockerfile --- AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h | 4 +--- Dockerfile | 2 +- Dockerfile.cuda | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 02d3f3ef..97c0dfe8 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -270,7 +270,7 @@ namespace SPTAG } #endif #endif - if (truth) { // TODO: check truth + if (truth) { for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; @@ -292,8 +292,6 @@ namespace SPTAG for (size_t i = 0; i < listInfo->listEleCount; ++i) { uint64_t offsetVectorID = m_enablePostingListRearrange ? (m_vectorInfoSize - sizeof(int)) * listInfo->listEleCount + sizeof(int) * i : m_vectorInfoSize * i; \ int vectorID = *(reinterpret_cast(p_postingListFullData + offsetVectorID)); \ - LOG(Helper::LogLevel::LL_Info, "vectorID: %d\n", vectorID); - if (truth && truth->count(vectorID)) (*found)[curPostingID].insert(vectorID); } } diff --git a/Dockerfile b/Dockerfile index f2a9f208..00ec1f84 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,4 +16,4 @@ COPY Wrappers ./Wrappers/ COPY GPUSupport ./GPUSupport/ COPY ThirdParty ./ThirdParty/ -RUN mkdir build && cd build && cmake .. && make -j$(nproc) +RUN mkdir build && cd build && cmake .. && make -j$(nproc) && cd .. diff --git a/Dockerfile.cuda b/Dockerfile.cuda index 4d6bf5ee..bfd26510 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -16,4 +16,4 @@ COPY Wrappers ./Wrappers/ COPY GPUSupport ./GPUSupport/ COPY ThirdParty ./ThirdParty/ -RUN mkdir build && cd build && cmake .. && make -j && cd .. +RUN mkdir build && cd build && cmake .. && -j$(nproc) && cd .. From 063c847befd7cc3827a749322eb8d0b2bcd70030 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Sun, 5 Jun 2022 08:49:37 +0000 Subject: [PATCH 15/25] check rvalue in Compressor.h --- AnnService/inc/Core/SPANN/Compressor.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 07cbc24c..06852361 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -140,11 +140,13 @@ namespace SPTAG { compress_level = level; dictBufferCapacity = bufferCapacity; + cdict = nullptr; + ddict = nullptr; } virtual ~Compressor() {} - std::size_t TrainDict(std::string samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) + std::size_t TrainDict(const std::string &samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) { dictBuffer.resize(dictBufferCapacity); size_t dictSize = ZDICT_trainFromBuffer((void *)dictBuffer.data(), dictBufferCapacity, (void *)samplesBuffer.data(), &samplesSizes[0], nbSamples); @@ -166,7 +168,7 @@ namespace SPTAG return dictBuffer; } - void SetDictBuffer(std::string buffer) + void SetDictBuffer(const std::string &buffer) { dictBuffer = buffer; CreateDDict(); From 4d692ed261d192bd3c110b51e2d82bd99fca305b Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Tue, 7 Jun 2022 11:12:45 +0000 Subject: [PATCH 16/25] resolve back compatibility by add a search option: WithDataCompressionFeatures --- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 95 +++++++++++-------- AnnService/inc/Core/SPANN/Options.h | 1 + .../inc/Core/SPANN/ParameterDefinitionList.h | 1 + 3 files changed, 60 insertions(+), 37 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 97c0dfe8..888d2521 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -97,6 +97,10 @@ namespace SPTAG public: ExtraFullGraphSearcher() { + m_enableDeltaEncoding = false; + m_enablePostingListRearrange = false; + m_enableDataCompression = false; + m_enableDictTraining = false; } virtual ~ExtraFullGraphSearcher() @@ -125,7 +129,7 @@ namespace SPTAG m_indexFiles.emplace_back(curIndexFile); m_listInfos.emplace_back(0); - m_totalListCount += LoadingHeadInfo(curFile, p_opt.m_searchPostingPageLimit, m_listInfos.back()); + m_totalListCount += LoadingHeadInfo(curFile, p_opt.m_searchPostingPageLimit, m_listInfos.back(), p_opt.m_withDataCompressionFeatures); curFile = m_extraFullGraphFile + "_" + std::to_string(m_indexFiles.size()); } while (fileexists(curFile.c_str())); @@ -708,7 +712,7 @@ namespace SPTAG std::uint16_t pageOffset = 0; }; - int LoadingHeadInfo(const std::string& p_file, int p_postingPageLimit, std::vector& m_listInfos) + int LoadingHeadInfo(const std::string& p_file, int p_postingPageLimit, std::vector& m_listInfos, bool m_withDataCompressionFeatures) { auto ptr = SPTAG::f_createIO(); if (ptr == nullptr || !ptr->Initialize(p_file.c_str(), std::ios::binary | std::ios::in)) { @@ -756,9 +760,12 @@ namespace SPTAG int pageNum; for (int i = 0; i < m_listCount; ++i) { - if (ptr->ReadBinary(sizeof(m_listInfos[i].listTotalBytes), reinterpret_cast(&(m_listInfos[i].listTotalBytes))) != sizeof(m_listInfos[i].listTotalBytes)) { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); + if (m_withDataCompressionFeatures) + { + if (ptr->ReadBinary(sizeof(m_listInfos[i].listTotalBytes), reinterpret_cast(&(m_listInfos[i].listTotalBytes))) != sizeof(m_listInfos[i].listTotalBytes)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } } if (ptr->ReadBinary(sizeof(pageNum), reinterpret_cast(&(pageNum))) != sizeof(pageNum)) { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); @@ -776,10 +783,12 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } + if (!m_withDataCompressionFeatures) + { + m_listInfos[i].listTotalBytes = m_listInfos[i].listEleCount * m_vectorInfoSize; + } m_listInfos[i].listOffset = (static_cast(m_listPageOffset + pageNum) << PageSizeEx); - // m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); - // m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); totalListElementCount += m_listInfos[i].listEleCount; int pageCount = m_listInfos[i].listPageCount; @@ -799,42 +808,54 @@ namespace SPTAG } } - if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - if (ptr->ReadBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - - if (m_enableDataCompression && m_enableDictTraining) + if (m_withDataCompressionFeatures) { - size_t dictBufferSize; - - if (ptr->ReadBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { + if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) + { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - char* dictBuffer = new char[dictBufferSize]; - if (ptr->ReadBinary(dictBufferSize, dictBuffer) != dictBufferSize) { + if (ptr->ReadBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) + { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) + { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); - delete[] dictBuffer; + if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) + { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + + if (m_enableDataCompression && m_enableDictTraining) + { + size_t dictBufferSize; + + if (ptr->ReadBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + char* dictBuffer = new char[dictBufferSize]; + if (ptr->ReadBinary(dictBufferSize, dictBuffer) != dictBufferSize) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); + exit(1); + } + m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); + delete[] dictBuffer; + } + } + + if (!m_enableDataCompression) + { + for (int i = 0; i < m_listCount; ++i) + { + m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); + m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); + } } LOG(Helper::LogLevel::LL_Info, @@ -1079,7 +1100,7 @@ namespace SPTAG exit(1); } } - // m_enableDeltaEncoding, + if (ptr->WriteBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 7c726c0a..be96e483 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -104,6 +104,7 @@ namespace SPTAG { int m_numGPUs; // Searching + bool m_withDataCompressionFeatures; std::string m_searchResult; std::string m_logFile; int m_qpsLimit; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index 7e3704d6..c2869974 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -102,6 +102,7 @@ DefineSSDParameter(m_gpuSSDLeafSize, int, 200, "GPUSSDLeafSize") DefineSSDParameter(m_numGPUs, int, 1, "NumGPUs") // Searching +DefineSSDParameter(m_withDataCompressionFeatures, bool, false, "WithDataCompressionFeatures") DefineSSDParameter(m_searchResult, std::string, std::string(""), "SearchResult") DefineSSDParameter(m_logFile, std::string, std::string(""), "LogFile") DefineSSDParameter(m_qpsLimit, int, 0, "QpsLimit") From aaa1fbe2645eae1f6d642af4291b534d36d3526b Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 8 Jun 2022 02:21:51 +0000 Subject: [PATCH 17/25] remove redundant configs, change head info format only when compression enabled --- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 91 ++++++------------- AnnService/inc/Core/SPANN/Options.h | 1 - .../inc/Core/SPANN/ParameterDefinitionList.h | 1 - 3 files changed, 26 insertions(+), 67 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 888d2521..cff60ffd 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -100,7 +100,7 @@ namespace SPTAG m_enableDeltaEncoding = false; m_enablePostingListRearrange = false; m_enableDataCompression = false; - m_enableDictTraining = false; + m_enableDictTraining = true; } virtual ~ExtraFullGraphSearcher() @@ -129,7 +129,11 @@ namespace SPTAG m_indexFiles.emplace_back(curIndexFile); m_listInfos.emplace_back(0); - m_totalListCount += LoadingHeadInfo(curFile, p_opt.m_searchPostingPageLimit, m_listInfos.back(), p_opt.m_withDataCompressionFeatures); + m_enableDeltaEncoding = p_opt.m_enableDeltaEncoding; + m_enablePostingListRearrange = p_opt.m_enablePostingListRearrange; + m_enableDataCompression = p_opt.m_enableDataCompression; + m_enableDictTraining = p_opt.m_enableDictTraining; + m_totalListCount += LoadingHeadInfo(curFile, p_opt.m_searchPostingPageLimit, m_listInfos.back()); curFile = m_extraFullGraphFile + "_" + std::to_string(m_indexFiles.size()); } while (fileexists(curFile.c_str())); @@ -570,12 +574,11 @@ namespace SPTAG if (p_opt.m_distCalcMethod == DistCalcMethod::Cosine && !p_reader->IsNormalized() && !p_headIndex->m_pQuantizer) fullVectors->Normalize(p_opt.m_iSSDNumberOfThreads); // get compressed size of each posting list - LOG(Helper::LogLevel::LL_Info, "EnableDeltaEncoding: %s\n", p_opt.m_enableDeltaEncoding ? "true" : "false"); - LOG(Helper::LogLevel::LL_Info, "EnableDataCompression: %s, ZstdCompressLevel: %d\n", p_opt.m_enableDataCompression ? "true" : "false", p_opt.m_zstdCompressLevel); std::vector postingListBytes(headVectorIDS.size()); - m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel, p_opt.m_dictBufferCapacity); + if (p_opt.m_enableDataCompression) { + m_pCompressor = std::make_unique(p_opt.m_zstdCompressLevel, p_opt.m_dictBufferCapacity); LOG(Helper::LogLevel::LL_Info, "Getting compressed size of each posting list...\n"); LOG(Helper::LogLevel::LL_Info, "Training dictionary...\n"); @@ -712,7 +715,7 @@ namespace SPTAG std::uint16_t pageOffset = 0; }; - int LoadingHeadInfo(const std::string& p_file, int p_postingPageLimit, std::vector& m_listInfos, bool m_withDataCompressionFeatures) + int LoadingHeadInfo(const std::string& p_file, int p_postingPageLimit, std::vector& m_listInfos) { auto ptr = SPTAG::f_createIO(); if (ptr == nullptr || !ptr->Initialize(p_file.c_str(), std::ios::binary | std::ios::in)) { @@ -760,7 +763,7 @@ namespace SPTAG int pageNum; for (int i = 0; i < m_listCount; ++i) { - if (m_withDataCompressionFeatures) + if (m_enableDataCompression) { if (ptr->ReadBinary(sizeof(m_listInfos[i].listTotalBytes), reinterpret_cast(&(m_listInfos[i].listTotalBytes))) != sizeof(m_listInfos[i].listTotalBytes)) { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); @@ -783,7 +786,7 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - if (!m_withDataCompressionFeatures) + if (!m_enableDataCompression) { m_listInfos[i].listTotalBytes = m_listInfos[i].listEleCount * m_vectorInfoSize; } @@ -808,45 +811,20 @@ namespace SPTAG } } - if (m_withDataCompressionFeatures) + if (m_enableDataCompression && m_enableDictTraining) { - if (ptr->ReadBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) - { + size_t dictBufferSize; + if (ptr->ReadBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - if (ptr->ReadBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) - { + char* dictBuffer = new char[dictBufferSize]; + if (ptr->ReadBinary(dictBufferSize, dictBuffer) != dictBufferSize) { LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - if (ptr->ReadBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - if (ptr->ReadBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - - if (m_enableDataCompression && m_enableDictTraining) - { - size_t dictBufferSize; - - if (ptr->ReadBinary(sizeof(size_t), reinterpret_cast(&dictBufferSize)) != sizeof(dictBufferSize)) { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - char* dictBuffer = new char[dictBufferSize]; - if (ptr->ReadBinary(dictBufferSize, dictBuffer) != dictBufferSize) { - LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); - exit(1); - } - m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); - delete[] dictBuffer; - } + m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); + delete[] dictBuffer; } if (!m_enableDataCompression) @@ -1002,9 +980,13 @@ namespace SPTAG // meta size of global info std::uint64_t listOffset = sizeof(int) * 4; // meta size of the posting lists - listOffset += (sizeof(size_t) + sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); + listOffset += (sizeof(int) + sizeof(std::uint16_t) + sizeof(int) + sizeof(std::uint16_t)) * p_postingListSizes.size(); + // write listTotalBytes only when enabled data compression + if (m_enableDataCompression) + { + listOffset += sizeof(size_t) * p_postingListSizes.size(); + } - listOffset += sizeof(bool) * 4; // compression dict if (m_enableDataCompression && m_enableDictTraining) { @@ -1074,8 +1056,8 @@ namespace SPTAG ++listPageCount; } } - // Total bytes of the posting list - if (ptr->WriteBinary(sizeof(postingListByte), reinterpret_cast(&postingListByte)) != sizeof(postingListByte)) { + // Total bytes of the posting list, write only when enabled data compression + if (m_enableDataCompression && ptr->WriteBinary(sizeof(postingListByte), reinterpret_cast(&postingListByte)) != sizeof(postingListByte)) { LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); exit(1); } @@ -1100,27 +1082,6 @@ namespace SPTAG exit(1); } } - - if (ptr->WriteBinary(sizeof(m_enableDeltaEncoding), reinterpret_cast(&m_enableDeltaEncoding)) != sizeof(m_enableDeltaEncoding)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); - exit(1); - } - if (ptr->WriteBinary(sizeof(m_enablePostingListRearrange), reinterpret_cast(&m_enablePostingListRearrange)) != sizeof(m_enablePostingListRearrange)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); - exit(1); - } - if (ptr->WriteBinary(sizeof(m_enableDataCompression), reinterpret_cast(&m_enableDataCompression)) != sizeof(m_enableDataCompression)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); - exit(1); - } - if (ptr->WriteBinary(sizeof(m_enableDictTraining), reinterpret_cast(&m_enableDictTraining)) != sizeof(m_enableDictTraining)) - { - LOG(Helper::LogLevel::LL_Error, "Failed to write SSDIndex File!"); - exit(1); - } // compression dict if (m_enableDataCompression && m_enableDictTraining) { diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index be96e483..7c726c0a 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -104,7 +104,6 @@ namespace SPTAG { int m_numGPUs; // Searching - bool m_withDataCompressionFeatures; std::string m_searchResult; std::string m_logFile; int m_qpsLimit; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index c2869974..7e3704d6 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -102,7 +102,6 @@ DefineSSDParameter(m_gpuSSDLeafSize, int, 200, "GPUSSDLeafSize") DefineSSDParameter(m_numGPUs, int, 1, "NumGPUs") // Searching -DefineSSDParameter(m_withDataCompressionFeatures, bool, false, "WithDataCompressionFeatures") DefineSSDParameter(m_searchResult, std::string, std::string(""), "SearchResult") DefineSSDParameter(m_logFile, std::string, std::string(""), "LogFile") DefineSSDParameter(m_qpsLimit, int, 0, "QpsLimit") From 4731e911804ed363597cfec2845273d4b64ba97e Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 8 Jun 2022 05:04:07 +0000 Subject: [PATCH 18/25] reuse buffer when decompression --- AnnService/inc/Core/SPANN/Compressor.h | 46 +++++-------------- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 22 ++++----- AnnService/inc/Core/SPANN/IExtraSearcher.h | 19 ++++++-- AnnService/src/Core/SPANN/SPANNIndex.cpp | 2 +- 4 files changed, 37 insertions(+), 52 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 06852361..7f36ed13 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -61,32 +61,23 @@ namespace SPTAG return comp_buffer; } - std::string DecompressWithDict(const char *src, size_t srcSize) + std::size_t DecompressWithDict(const char* src, size_t srcSize, char* dst, size_t dstCapacity) { - auto const est_decomp_size = - ZSTD_getFrameContentSize(src, srcSize); - - std::string decomp_buffer{}; - decomp_buffer.resize(est_decomp_size); - - ZSTD_DCtx *const dctx = ZSTD_createDCtx(); + ZSTD_DCtx* const dctx = ZSTD_createDCtx(); if (dctx == NULL) { LOG(Helper::LogLevel::LL_Error, "ZSTD_createDCtx() failed! \n"); exit(1); } - size_t const decomp_size = ZSTD_decompress_usingDDict(dctx, - (void *)decomp_buffer.data(), est_decomp_size, src, srcSize, ddict); + std::size_t const decomp_size = ZSTD_decompress_usingDDict(dctx, + (void*)dst, dstCapacity, src, srcSize, ddict); if (ZSTD_isError(decomp_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); exit(1); } - ZSTD_freeDCtx(dctx); - decomp_buffer.resize(decomp_size); - decomp_buffer.shrink_to_fit(); - return decomp_buffer; + return decomp_size; } std::string CompressWithoutDict(const std::string &src) @@ -107,32 +98,17 @@ namespace SPTAG return buffer; } - std::string DecompressWithoutDict(const char *src, size_t srcSize) + std::size_t DecompressWithoutDict(const char *src, size_t srcSize, char* dst, size_t dstCapacity) { - size_t est_decomp_size = ZSTD_getFrameContentSize(src, srcSize); - if (est_decomp_size == ZSTD_CONTENTSIZE_ERROR) - { - LOG(Helper::LogLevel::LL_Error, "not compressed by zstd!\n"); - exit(1); - } - else if (est_decomp_size == ZSTD_CONTENTSIZE_UNKNOWN) - { - LOG(Helper::LogLevel::LL_Error, "original size unknown!\n"); - exit(1); - } - std::string dst{}; - dst.resize(est_decomp_size); - size_t const decomp_size = ZSTD_decompress( - (void *)dst.data(), est_decomp_size, src, srcSize); + std::size_t const decomp_size = ZSTD_decompress( + (void *)dst, dstCapacity, src, srcSize); if (ZSTD_isError(decomp_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); exit(1); } - dst.resize(decomp_size); - dst.shrink_to_fit(); - return dst; + return decomp_size; } public: @@ -179,9 +155,9 @@ namespace SPTAG return useDict ? CompressWithDict(src) : CompressWithoutDict(src); } - std::string Decompress(const char *src, size_t srcSize, const bool useDict) + std::size_t Decompress(const char *src, size_t srcSize, char* dst, size_t dstCapacity, const bool useDict) { - return useDict ? DecompressWithDict(src, srcSize) : DecompressWithoutDict(src, srcSize); + return useDict ? DecompressWithDict(src, srcSize, dst, dstCapacity) : DecompressWithoutDict(src, srcSize, dst, dstCapacity); } // return the compressed sie diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index cff60ffd..303460dd 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -201,26 +201,25 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding = m_enableDeltaEncoding, m_enablePostingListRearrange = m_enablePostingListRearrange, m_enableDictTraining = m_enableDictTraining, m_enableDataCompression = m_enableDataCompression, &m_pCompressor = m_pCompressor](Helper::AsyncReadRequest *request) + request.m_callback = [&p_exWorkSpace, pi, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding = m_enableDeltaEncoding, m_enablePostingListRearrange = m_enablePostingListRearrange, m_enableDictTraining = m_enableDictTraining, m_enableDataCompression = m_enableDataCompression, &m_pCompressor = m_pCompressor](Helper::AsyncReadRequest *request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); // decompress posting list char* p_postingListFullData = buffer + listInfo->pageOffset; - std::string postingListFullData(""); if (m_enableDataCompression) { + p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); if (listInfo->listEleCount != 0) { - postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); - } - if (postingListFullData.size() != listInfo->listEleCount * vectorInfoSize) - { - LOG(Helper::LogLevel::LL_Info, "postingListFullData size not match! %zu, %d, \n", postingListFullData.size(), listInfo->listEleCount * vectorInfoSize); - exit(1); + std::size_t sizePostingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * vectorInfoSize, m_enableDictTraining); + if (sizePostingListFullData != listInfo->listEleCount * vectorInfoSize) + { + LOG(Helper::LogLevel::LL_Info, "postingListFullData size not match! %zu, %d, \n", sizePostingListFullData, listInfo->listEleCount * vectorInfoSize); + exit(1); + } } - p_postingListFullData = &postingListFullData[0]; } // delta encoding @@ -287,14 +286,13 @@ namespace SPTAG char* buffer = (char*)((p_exWorkSpace->m_pageBuffers[pi]).GetBuffer()); char* p_postingListFullData = buffer + listInfo->pageOffset; - std::string postingListFullData(""); if (m_enableDataCompression) { + p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); if (listInfo->listEleCount != 0) { - postingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, m_enableDictTraining); + m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * m_vectorInfoSize, m_enableDictTraining); } - p_postingListFullData = &postingListFullData[0]; } for (size_t i = 0; i < listInfo->listEleCount; ++i) { diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 7def4813..55ed6c51 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -108,11 +108,11 @@ namespace SPTAG { ~ExtraWorkSpace() {} ExtraWorkSpace(ExtraWorkSpace& other) { - Initialize(other.m_deduper.MaxCheck(), other.m_deduper.HashTableExponent(), (int)other.m_pageBuffers.size(), (int)(other.m_pageBuffers[0].GetPageSize())); + Initialize(other.m_deduper.MaxCheck(), other.m_deduper.HashTableExponent(), (int)other.m_pageBuffers.size(), (int)(other.m_pageBuffers[0].GetPageSize()), other.m_enableDataCompression); m_spaceID = g_spaceCount++; } - void Initialize(int p_maxCheck, int p_hashExp, int p_internalResultNum, int p_maxPages) { + void Initialize(int p_maxCheck, int p_hashExp, int p_internalResultNum, int p_maxPages, bool enableDataCompression) { m_postingIDs.reserve(p_internalResultNum); m_deduper.Init(p_maxCheck, p_hashExp); m_processIocp.reset(p_internalResultNum); @@ -121,6 +121,14 @@ namespace SPTAG { m_pageBuffers[pi].ReservePageBuffer(p_maxPages); } m_diskRequests.resize(p_internalResultNum); + m_enableDataCompression = enableDataCompression; + if (enableDataCompression) { + + m_decompressBuffers.resize(p_internalResultNum); + for (int pi = 0; pi < p_internalResultNum; pi++) { + m_decompressBuffers[pi].ReservePageBuffer(p_maxPages); + } + } } void Initialize(va_list& arg) { @@ -128,7 +136,8 @@ namespace SPTAG { int hashExp = va_arg(arg, int); int internalResultNum = va_arg(arg, int); int maxPages = va_arg(arg, int); - Initialize(maxCheck, hashExp, internalResultNum, maxPages); + int enableDataCompression = va_arg(arg, bool); + Initialize(maxCheck, hashExp, internalResultNum, maxPages, enableDataCompression); } static void Reset() { g_spaceCount = 0; } @@ -141,6 +150,9 @@ namespace SPTAG { std::vector> m_pageBuffers; + bool m_enableDataCompression; + std::vector> m_decompressBuffers; + std::vector m_diskRequests; int m_spaceID; @@ -155,7 +167,6 @@ namespace SPTAG { { } - virtual ~IExtraSearcher() { } diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index fe8c526a..953d2974 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -102,7 +102,7 @@ namespace SPTAG omp_set_num_threads(m_options.m_iSSDNumberOfThreads); m_workSpacePool.reset(new COMMON::WorkSpacePool()); - m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx); + m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, m_options.m_enableDataCompression); return ErrorCode::Success; } From 9bf097aa3e37e9da48873576bd5405c8849cb2b2 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 8 Jun 2022 07:18:08 +0000 Subject: [PATCH 19/25] remove exit(1) from search index --- AnnService/inc/Core/SPANN/Compressor.h | 8 ++--- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 29 +++++++++++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Compressor.h b/AnnService/inc/Core/SPANN/Compressor.h index 7f36ed13..86100608 100644 --- a/AnnService/inc/Core/SPANN/Compressor.h +++ b/AnnService/inc/Core/SPANN/Compressor.h @@ -32,7 +32,7 @@ namespace SPTAG if (ddict == NULL) { LOG(Helper::LogLevel::LL_Error, "ZSTD_createDDict() failed! \n"); - exit(1); + throw std::runtime_error("ZSTD_createDDict() failed!"); } } @@ -67,14 +67,14 @@ namespace SPTAG if (dctx == NULL) { LOG(Helper::LogLevel::LL_Error, "ZSTD_createDCtx() failed! \n"); - exit(1); + throw std::runtime_error("ZSTD_createDCtx() failed!"); } std::size_t const decomp_size = ZSTD_decompress_usingDDict(dctx, (void*)dst, dstCapacity, src, srcSize, ddict); if (ZSTD_isError(decomp_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); - exit(1); + throw std::runtime_error("ZSTD decompress failed."); } ZSTD_freeDCtx(dctx); return decomp_size; @@ -105,7 +105,7 @@ namespace SPTAG if (ZSTD_isError(decomp_size)) { LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size)); - exit(1); + throw std::runtime_error("ZSTD decompress failed."); } return decomp_size; diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 303460dd..c21f715c 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -213,11 +213,18 @@ namespace SPTAG p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); if (listInfo->listEleCount != 0) { - std::size_t sizePostingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * vectorInfoSize, m_enableDictTraining); + std::size_t sizePostingListFullData; + try { + sizePostingListFullData = m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * vectorInfoSize, m_enableDictTraining); + } + catch (std::runtime_error &err) { + LOG(Helper::LogLevel::LL_Error, "Decompress postingList %d failed! %s, \n", curPostingID, err.what()); + return; + } if (sizePostingListFullData != listInfo->listEleCount * vectorInfoSize) { - LOG(Helper::LogLevel::LL_Info, "postingListFullData size not match! %zu, %d, \n", sizePostingListFullData, listInfo->listEleCount * vectorInfoSize); - exit(1); + LOG(Helper::LogLevel::LL_Error, "PostingList %d decompressed size not match! %zu, %d, \n", curPostingID, sizePostingListFullData, listInfo->listEleCount * vectorInfoSize); + return; } } } @@ -291,7 +298,13 @@ namespace SPTAG p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); if (listInfo->listEleCount != 0) { - m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * m_vectorInfoSize, m_enableDictTraining); + try { + m_pCompressor->Decompress(buffer + listInfo->pageOffset, listInfo->listTotalBytes, p_postingListFullData, listInfo->listEleCount * m_vectorInfoSize, m_enableDictTraining); + } + catch (std::runtime_error& err) { + LOG(Helper::LogLevel::LL_Error, "Decompress postingList %d failed! %s, \n", curPostingID, err.what()); + continue; + } } } @@ -821,7 +834,13 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } - m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); + try { + m_pCompressor->SetDictBuffer(std::string(dictBuffer, dictBufferSize)); + } + catch (std::runtime_error& err) { + LOG(Helper::LogLevel::LL_Error, "Failed to read head info file: %s \n", err.what()); + exit(1); + } delete[] dictBuffer; } From bbe8a6da734f05727ee9e4508862a6f217ba3f5f Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 8 Jun 2022 12:53:57 +0000 Subject: [PATCH 20/25] bug fix: reuse compression buffer in workspace # with '#' will be ignored, and an empty message aborts the commit. --- AnnService/SSDServing.vcxproj | 2 +- AnnService/inc/Core/SPANN/IExtraSearcher.h | 3 +-- AnnService/src/Core/SPANN/SPANNIndex.cpp | 8 ++++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index bfec997b..4dd1b372 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -54,7 +54,7 @@ MultiByte - StaticLibrary + Application false v142 true diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 55ed6c51..d6ac7a07 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -123,7 +123,6 @@ namespace SPTAG { m_diskRequests.resize(p_internalResultNum); m_enableDataCompression = enableDataCompression; if (enableDataCompression) { - m_decompressBuffers.resize(p_internalResultNum); for (int pi = 0; pi < p_internalResultNum; pi++) { m_decompressBuffers[pi].ReservePageBuffer(p_maxPages); @@ -136,7 +135,7 @@ namespace SPTAG { int hashExp = va_arg(arg, int); int internalResultNum = va_arg(arg, int); int maxPages = va_arg(arg, int); - int enableDataCompression = va_arg(arg, bool); + bool enableDataCompression = bool(va_arg(arg, int)); Initialize(maxCheck, hashExp, internalResultNum, maxPages, enableDataCompression); } diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index 953d2974..fbd6955d 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -102,7 +102,7 @@ namespace SPTAG omp_set_num_threads(m_options.m_iSSDNumberOfThreads); m_workSpacePool.reset(new COMMON::WorkSpacePool()); - m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, m_options.m_enableDataCompression); + m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, int(m_options.m_enableDataCompression)); return ErrorCode::Success; } @@ -134,7 +134,7 @@ namespace SPTAG omp_set_num_threads(m_options.m_iSSDNumberOfThreads); m_workSpacePool.reset(new COMMON::WorkSpacePool()); - m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx); + m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, int(m_options.m_enableDataCompression)); return ErrorCode::Success; } @@ -707,7 +707,7 @@ namespace SPTAG } m_workSpacePool.reset(new COMMON::WorkSpacePool()); - m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx); + m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, int(m_options.m_enableDataCompression)); m_bReady = true; return ErrorCode::Success; } @@ -759,7 +759,7 @@ namespace SPTAG omp_set_num_threads(m_options.m_iSSDNumberOfThreads); m_index->UpdateIndex(); m_workSpacePool.reset(new COMMON::WorkSpacePool()); - m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx); + m_workSpacePool->Init(m_options.m_iSSDNumberOfThreads, m_options.m_maxCheck, m_options.m_hashExp, m_options.m_searchInternalResultNum, max(m_options.m_postingPageLimit, m_options.m_searchPostingPageLimit + 1) << PageSizeEx, int(m_options.m_enableDataCompression)); return ErrorCode::Success; } From 607f8c71ccf87371736e24d60e6842d23758d8fe Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Wed, 8 Jun 2022 14:06:07 +0000 Subject: [PATCH 21/25] code refine --- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index c21f715c..456870df 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -164,7 +164,6 @@ namespace SPTAG #endif bool oneContext = (m_indexFiles.size() == 1); - for (uint32_t pi = 0; pi < postingListCount; ++pi) { auto curPostingID = p_exWorkSpace->m_postingIDs[pi]; @@ -394,7 +393,6 @@ namespace SPTAG int numThreads = p_opt.m_iSSDNumberOfThreads; int candidateNum = p_opt.m_internalResultNum; - // get headVectorIDs std::unordered_set headVectorIDS; if (p_opt.m_headIDFile.empty()) { LOG(Helper::LogLevel::LL_Error, "Not found VectorIDTranslate!\n"); @@ -652,10 +650,9 @@ namespace SPTAG // iterate over files for (int i = 0; i < p_opt.m_ssdIndexFileNum; i++) { - // postingFileSize: number of posting lists in the file - // postingListSize: number of vectors in the posting list, type vector size_t curPostingListOffSet = i * postingFileSize; size_t curPostingListEnd = min(postingListSize.size(), (i + 1) * postingFileSize); + // postingListSize: number of vectors in the posting list, type vector std::vector curPostingListSizes( postingListSize.begin() + curPostingListOffSet, postingListSize.begin() + curPostingListEnd); @@ -671,7 +668,6 @@ namespace SPTAG // LoadBatch: select vectors for each posting list if (p_opt.m_ssdIndexFileNum > 1) selections.LoadBatch(selectionsBatchOffset[i], selectionsBatchOffset[i + 1]); - // write one file OutputSSDIndexFile((i == 0) ? outputFile : outputFile + "_" + std::to_string(i), p_opt.m_enableDeltaEncoding, p_opt.m_enablePostingListRearrange, @@ -797,12 +793,13 @@ namespace SPTAG LOG(Helper::LogLevel::LL_Error, "Failed to read head info file!\n"); exit(1); } + m_listInfos[i].listOffset = (static_cast(m_listPageOffset + pageNum) << PageSizeEx); if (!m_enableDataCompression) { m_listInfos[i].listTotalBytes = m_listInfos[i].listEleCount * m_vectorInfoSize; + m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); + m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); } - - m_listInfos[i].listOffset = (static_cast(m_listPageOffset + pageNum) << PageSizeEx); totalListElementCount += m_listInfos[i].listEleCount; int pageCount = m_listInfos[i].listPageCount; @@ -843,15 +840,6 @@ namespace SPTAG } delete[] dictBuffer; } - - if (!m_enableDataCompression) - { - for (int i = 0; i < m_listCount; ++i) - { - m_listInfos[i].listEleCount = min(m_listInfos[i].listEleCount, (min(static_cast(m_listInfos[i].listPageCount), p_postingPageLimit) << PageSizeEx) / m_vectorInfoSize); - m_listInfos[i].listPageCount = static_cast(ceil((m_vectorInfoSize * m_listInfos[i].listEleCount + m_listInfos[i].pageOffset) * 1.0 / (1 << PageSizeEx))); - } - } LOG(Helper::LogLevel::LL_Info, "Finish reading header info, list count %d, total doc count %d, dimension %d, list page offset %d.\n", From 7645ff80635b5bf21295f19a99d9435329ae1168 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Thu, 9 Jun 2022 03:43:46 +0000 Subject: [PATCH 22/25] optimize mem usage --- AnnService/SSDServing.vcxproj | 4 +-- .../inc/Core/SPANN/ExtraFullGraphSearcher.h | 27 +++++++++---------- AnnService/inc/Core/SPANN/IExtraSearcher.h | 7 ++--- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index 4dd1b372..b6613509 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -54,7 +54,7 @@ MultiByte - Application + StaticLibrary false v142 true @@ -158,7 +158,7 @@ true /Zc:twoPhase- %(AdditionalOptions) Default - Disabled + MaxSpeed Console diff --git a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h index 456870df..815530b0 100644 --- a/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraFullGraphSearcher.h @@ -81,8 +81,14 @@ namespace SPTAG } }; -#define ProcessPosting(p_postingListFullData, vectorInfoSize, m_enablePostingListRearrange) \ +#define ProcessPosting(p_postingListFullData, vectorInfoSize, m_enablePostingListRearrange, m_enableDeltaEncoding, headVector) \ for (int i = 0; i < listInfo->listEleCount; i++) { \ + if (m_enableDeltaEncoding) { \ + ValueType* leaf = m_enablePostingListRearrange ? reinterpret_cast(p_postingListFullData + (vectorInfoSize - sizeof(int)) * i) : reinterpret_cast(p_postingListFullData + vectorInfoSize * i + sizeof(int)); \ + for (auto i = 0; i < p_index->GetFeatureDim(); i++) { \ + leaf[i] += headVector[i]; \ + } \ + } \ uint64_t offsetVectorID = m_enablePostingListRearrange ? (vectorInfoSize - sizeof(int)) * listInfo->listEleCount + sizeof(int) * i : vectorInfoSize * i; \ int vectorID = *(reinterpret_cast(p_postingListFullData + offsetVectorID));\ if (p_exWorkSpace->m_deduper.CheckAndSet(vectorID)) continue; \ @@ -200,7 +206,7 @@ namespace SPTAG #ifdef BATCH_READ // async batch read auto vectorInfoSize = m_vectorInfoSize; - request.m_callback = [&p_exWorkSpace, pi, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding = m_enableDeltaEncoding, m_enablePostingListRearrange = m_enablePostingListRearrange, m_enableDictTraining = m_enableDictTraining, m_enableDataCompression = m_enableDataCompression, &m_pCompressor = m_pCompressor](Helper::AsyncReadRequest *request) + request.m_callback = [&p_exWorkSpace, &queryResults, &p_index, vectorInfoSize, curPostingID, m_enableDeltaEncoding = m_enableDeltaEncoding, m_enablePostingListRearrange = m_enablePostingListRearrange, m_enableDictTraining = m_enableDictTraining, m_enableDataCompression = m_enableDataCompression, &m_pCompressor = m_pCompressor](Helper::AsyncReadRequest *request) { char* buffer = request->m_buffer; ListInfo* listInfo = (ListInfo*)(request->m_payload); @@ -209,7 +215,7 @@ namespace SPTAG char* p_postingListFullData = buffer + listInfo->pageOffset; if (m_enableDataCompression) { - p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); + p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffer.GetBuffer(); if (listInfo->listEleCount != 0) { std::size_t sizePostingListFullData; @@ -229,20 +235,13 @@ namespace SPTAG } // delta encoding + ValueType* headVector = nullptr; if (m_enableDeltaEncoding) { - ValueType *headVector = (ValueType *)p_index->GetSample(curPostingID); - for (int i = 0; i < listInfo->listEleCount; i++) - { - ValueType *leaf = m_enablePostingListRearrange ? reinterpret_cast(p_postingListFullData + (vectorInfoSize - sizeof(int)) * i) : reinterpret_cast(p_postingListFullData + vectorInfoSize * i + sizeof(int)); - for (auto i = 0; i < p_index->GetFeatureDim(); i++) - { - leaf[i] += headVector[i]; - } - } + headVector = (ValueType*)p_index->GetSample(curPostingID); } - ProcessPosting(const_cast(p_postingListFullData), vectorInfoSize, m_enablePostingListRearrange); + ProcessPosting(const_cast(p_postingListFullData), vectorInfoSize, m_enablePostingListRearrange, m_enableDeltaEncoding, headVector); }; #else // async read request.m_callback = [&p_exWorkSpace](Helper::AsyncReadRequest* request) @@ -294,7 +293,7 @@ namespace SPTAG char* p_postingListFullData = buffer + listInfo->pageOffset; if (m_enableDataCompression) { - p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffers[pi].GetBuffer(); + p_postingListFullData = (char*)p_exWorkSpace->m_decompressBuffer.GetBuffer(); if (listInfo->listEleCount != 0) { try { diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index d6ac7a07..8b5d5d07 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -123,10 +123,7 @@ namespace SPTAG { m_diskRequests.resize(p_internalResultNum); m_enableDataCompression = enableDataCompression; if (enableDataCompression) { - m_decompressBuffers.resize(p_internalResultNum); - for (int pi = 0; pi < p_internalResultNum; pi++) { - m_decompressBuffers[pi].ReservePageBuffer(p_maxPages); - } + m_decompressBuffer.ReservePageBuffer(p_maxPages); } } @@ -150,7 +147,7 @@ namespace SPTAG { std::vector> m_pageBuffers; bool m_enableDataCompression; - std::vector> m_decompressBuffers; + PageBuffer m_decompressBuffer; std::vector m_diskRequests; From 3ad6a101127abfbcee6e9f41e92a893200e25fca Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Thu, 9 Jun 2022 04:30:46 +0000 Subject: [PATCH 23/25] replace vcpkg zstd with nuget --- AnnService/CoreLibrary.vcxproj | 18 +++++++----------- AnnService/SSDServing.vcxproj | 12 +++++++----- AnnService/SSDServing.vcxproj.filters | 3 +++ AnnService/packages.config | 3 ++- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index b7328bbd..6446461a 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -99,11 +99,7 @@ ProgramDatabase /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) stdcpp17 - $(SolutionDir)\vcpkg\installed\x64-windows\include - - $(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib - @@ -132,18 +128,11 @@ _MBCS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) Default - $(SolutionDir)\vcpkg\installed\x64-windows\include true true - - $(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib - - - XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y - @@ -231,5 +220,12 @@ + + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. + + + \ No newline at end of file diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index b6613509..3aa0f977 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -27,6 +27,9 @@ + + + 15.0 {217B42B7-8F2B-4323-804C-08992CA2F65E} @@ -152,7 +155,7 @@ true _$(OutputType);_MBCS;_SCL_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true - $(SolutionDir)\vcpkg\installed\x64-windows\include;%(AdditionalIncludeDirectories) + %(AdditionalIncludeDirectories) NotUsing inc/SSDServing/Common/stdafx.h true @@ -166,11 +169,8 @@ true true %(AdditionalLibraryDirectories) - CoreLibrary.lib;$(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib;%(AdditionalDependencies) + CoreLibrary.lib;%(AdditionalDependencies) - - XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y - @@ -181,6 +181,7 @@ + @@ -193,5 +194,6 @@ + \ No newline at end of file diff --git a/AnnService/SSDServing.vcxproj.filters b/AnnService/SSDServing.vcxproj.filters index 8f36f5f3..95d60611 100644 --- a/AnnService/SSDServing.vcxproj.filters +++ b/AnnService/SSDServing.vcxproj.filters @@ -27,4 +27,7 @@ Source Files + + + \ No newline at end of file diff --git a/AnnService/packages.config b/AnnService/packages.config index 991c8539..4ad04937 100644 --- a/AnnService/packages.config +++ b/AnnService/packages.config @@ -2,10 +2,11 @@ + - + \ No newline at end of file From 7a7f4bcb61aabc9b45e45a9ff79b1e079fc6c994 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Thu, 9 Jun 2022 05:09:24 +0000 Subject: [PATCH 24/25] config issue --- AnnService/CoreLibrary.vcxproj | 2 +- AnnService/SSDServing.vcxproj | 5 ++--- SPTAG.nuspec | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index 6446461a..4cd347d3 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -127,7 +127,7 @@ true _MBCS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) /Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions) - Default + stdcpp17 true diff --git a/AnnService/SSDServing.vcxproj b/AnnService/SSDServing.vcxproj index 3aa0f977..f6676597 100644 --- a/AnnService/SSDServing.vcxproj +++ b/AnnService/SSDServing.vcxproj @@ -1,4 +1,4 @@ - + @@ -160,8 +160,7 @@ inc/SSDServing/Common/stdafx.h true /Zc:twoPhase- %(AdditionalOptions) - Default - MaxSpeed + stdcpp17 Console diff --git a/SPTAG.nuspec b/SPTAG.nuspec index 0dd57528..b739a1b7 100644 --- a/SPTAG.nuspec +++ b/SPTAG.nuspec @@ -48,6 +48,7 @@ + From 386d5dc25415024e6b63a3bc4aab23ca0eeea123 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Thu, 9 Jun 2022 12:24:45 +0000 Subject: [PATCH 25/25] fix solution build issue --- AnnService/Aggregator.vcxproj | 2 ++ AnnService/Client.vcxproj | 2 ++ AnnService/IndexBuilder.vcxproj | 2 ++ AnnService/IndexSearcher.vcxproj | 2 ++ AnnService/Quantizer.vcxproj | 2 ++ AnnService/Server.vcxproj | 2 ++ Test/Test.vcxproj | 2 ++ Test/packages.config | 1 + Wrappers/CLRCore.vcxproj | 2 ++ Wrappers/CsharpClient.vcxproj | 2 ++ Wrappers/CsharpCore.vcxproj | 2 ++ Wrappers/PythonClient.vcxproj | 2 ++ Wrappers/PythonCore.vcxproj | 2 ++ Wrappers/packages.config | 1 + 14 files changed, 26 insertions(+) diff --git a/AnnService/Aggregator.vcxproj b/AnnService/Aggregator.vcxproj index af24dded..4946c13f 100644 --- a/AnnService/Aggregator.vcxproj +++ b/AnnService/Aggregator.vcxproj @@ -165,6 +165,7 @@ + @@ -177,5 +178,6 @@ + \ No newline at end of file diff --git a/AnnService/Client.vcxproj b/AnnService/Client.vcxproj index f88234be..9381af59 100644 --- a/AnnService/Client.vcxproj +++ b/AnnService/Client.vcxproj @@ -132,6 +132,7 @@ + @@ -144,5 +145,6 @@ + \ No newline at end of file diff --git a/AnnService/IndexBuilder.vcxproj b/AnnService/IndexBuilder.vcxproj index f82825fa..0900590c 100644 --- a/AnnService/IndexBuilder.vcxproj +++ b/AnnService/IndexBuilder.vcxproj @@ -155,6 +155,7 @@ + @@ -167,5 +168,6 @@ + \ No newline at end of file diff --git a/AnnService/IndexSearcher.vcxproj b/AnnService/IndexSearcher.vcxproj index 88214858..6d137837 100644 --- a/AnnService/IndexSearcher.vcxproj +++ b/AnnService/IndexSearcher.vcxproj @@ -156,6 +156,7 @@ + @@ -168,5 +169,6 @@ + \ No newline at end of file diff --git a/AnnService/Quantizer.vcxproj b/AnnService/Quantizer.vcxproj index bdcebcd0..942e55e1 100644 --- a/AnnService/Quantizer.vcxproj +++ b/AnnService/Quantizer.vcxproj @@ -171,6 +171,7 @@ + @@ -183,5 +184,6 @@ + \ No newline at end of file diff --git a/AnnService/Server.vcxproj b/AnnService/Server.vcxproj index fe9a7c8d..3b38afe4 100644 --- a/AnnService/Server.vcxproj +++ b/AnnService/Server.vcxproj @@ -140,6 +140,7 @@ + @@ -152,5 +153,6 @@ + \ No newline at end of file diff --git a/Test/Test.vcxproj b/Test/Test.vcxproj index 68e67dac..cba1cb90 100644 --- a/Test/Test.vcxproj +++ b/Test/Test.vcxproj @@ -176,6 +176,7 @@ + @@ -190,5 +191,6 @@ + \ No newline at end of file diff --git a/Test/packages.config b/Test/packages.config index 27713806..8adb5ddb 100644 --- a/Test/packages.config +++ b/Test/packages.config @@ -9,4 +9,5 @@ + \ No newline at end of file diff --git a/Wrappers/CLRCore.vcxproj b/Wrappers/CLRCore.vcxproj index 9aa9db77..dcbf66bf 100644 --- a/Wrappers/CLRCore.vcxproj +++ b/Wrappers/CLRCore.vcxproj @@ -150,6 +150,7 @@ + @@ -162,5 +163,6 @@ + \ No newline at end of file diff --git a/Wrappers/CsharpClient.vcxproj b/Wrappers/CsharpClient.vcxproj index ed558eb7..1e48b766 100644 --- a/Wrappers/CsharpClient.vcxproj +++ b/Wrappers/CsharpClient.vcxproj @@ -169,6 +169,7 @@ + @@ -189,5 +190,6 @@ + \ No newline at end of file diff --git a/Wrappers/CsharpCore.vcxproj b/Wrappers/CsharpCore.vcxproj index 692d418e..eea06963 100644 --- a/Wrappers/CsharpCore.vcxproj +++ b/Wrappers/CsharpCore.vcxproj @@ -54,6 +54,7 @@ + @@ -132,5 +133,6 @@ This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. + \ No newline at end of file diff --git a/Wrappers/PythonClient.vcxproj b/Wrappers/PythonClient.vcxproj index afdfb14e..25f5b729 100644 --- a/Wrappers/PythonClient.vcxproj +++ b/Wrappers/PythonClient.vcxproj @@ -166,6 +166,7 @@ + @@ -184,5 +185,6 @@ + \ No newline at end of file diff --git a/Wrappers/PythonCore.vcxproj b/Wrappers/PythonCore.vcxproj index 64fdc8be..db9b1184 100644 --- a/Wrappers/PythonCore.vcxproj +++ b/Wrappers/PythonCore.vcxproj @@ -117,6 +117,7 @@ + @@ -130,5 +131,6 @@ + \ No newline at end of file diff --git a/Wrappers/packages.config b/Wrappers/packages.config index 784b338f..d4000416 100644 --- a/Wrappers/packages.config +++ b/Wrappers/packages.config @@ -9,4 +9,5 @@ + \ No newline at end of file