Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support data compression & delta-encoding of posting lists #297

Merged
merged 28 commits into from
Jun 9, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6583ee5
MVP version of Data compress (#1)
suiguoxin May 20, 2022
f61c390
Support delta-encoding (#2)
suiguoxin May 24, 2022
1003594
support config of compress level (#3)
suiguoxin May 25, 2022
d17a126
integrate zstd with cmake (#4)
suiguoxin May 25, 2022
83efc3f
Bug Fix: wrong listPageCount when listTotalBytes % pageSize==0 (#5)
suiguoxin May 29, 2022
0f33ca3
train & share dictionary (#6)
suiguoxin May 30, 2022
17a2150
rearrange posting list (#7)
suiguoxin May 30, 2022
f502072
config minDictTraingBufferSize and dictBufferCapacity (#8)
suiguoxin May 31, 2022
223bdd3
cmake with local installed zstd (#9)
suiguoxin May 31, 2022
e23174a
refine visual studio config (#10)
suiguoxin May 31, 2022
d6e7f41
parallel for get compressed size (#11)
suiguoxin May 31, 2022
76037b3
fix check truth bug (#12)
suiguoxin Jun 1, 2022
138b8da
change zstd branch (#13)
suiguoxin Jun 1, 2022
161e7a8
remove verbose log in truth analysis; refine dockerfile
suiguoxin Jun 1, 2022
063c847
check rvalue in Compressor.h
suiguoxin Jun 5, 2022
4d692ed
resolve back compatibility by add a search option: WithDataCompressio…
suiguoxin Jun 7, 2022
aaa1fbe
remove redundant configs, change head info format only when compressi…
suiguoxin Jun 8, 2022
4731e91
reuse buffer when decompression
suiguoxin Jun 8, 2022
9bf097a
remove exit(1) from search index
suiguoxin Jun 8, 2022
bbe8a6d
bug fix: reuse compression buffer in workspace
suiguoxin Jun 8, 2022
607f8c7
code refine
suiguoxin Jun 8, 2022
0dc7cdb
Merge branch 'main' into fb-data-compress
PhilipBAdams Jun 8, 2022
7645ff8
optimize mem usage
suiguoxin Jun 9, 2022
1e4c62d
Merge branch 'fb-data-compress' of github.com:suiguoxin/SPTAG into fb…
suiguoxin Jun 9, 2022
3ad6a10
replace vcpkg zstd with nuget
suiguoxin Jun 9, 2022
7a7f4bc
config issue
suiguoxin Jun 9, 2022
386d5dc
fix solution build issue
suiguoxin Jun 9, 2022
c9c35fd
Merge branch 'main' into fb-data-compress
suiguoxin Jun 9, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "ThirdParty/zstd"]
path = ThirdParty/zstd
url = https://github.com/facebook/zstd
branch = release
6 changes: 4 additions & 2 deletions AnnService/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
# Licensed under the MIT License.

set(AnnService ${PROJECT_SOURCE_DIR}/AnnService)
set(Zstd ${PROJECT_SOURCE_DIR}/ThirdParty/zstd)

include_directories(${AnnService})
include_directories(${Zstd}/lib)

file(GLOB_RECURSE HDR_FILES ${AnnService}/inc/Core/*.h ${AnnService}/inc/Helper/*.h)
file(GLOB_RECURSE SRC_FILES ${AnnService}/src/Core/*.cpp ${AnnService}/src/Helper/*.cpp)
Expand Down Expand Up @@ -32,9 +34,9 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
endif()

add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES})
target_link_libraries (SPTAGLib DistanceUtils)
target_link_libraries (SPTAGLib DistanceUtils libzstd_shared)
add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES})
target_link_libraries (SPTAGLibStatic DistanceUtils)
target_link_libraries (SPTAGLibStatic DistanceUtils libzstd_static)
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
target_compile_options(SPTAGLibStatic PRIVATE -fPIC)
endif()
Expand Down
14 changes: 13 additions & 1 deletion AnnService/CoreLibrary.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<AdditionalOptions>/Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(SolutionDir)\vcpkg\installed\x64-windows\include</AdditionalIncludeDirectories>
</ClCompile>
<Lib>
<AdditionalDependencies>$(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
Expand Down Expand Up @@ -127,12 +131,19 @@
<OpenMPSupport>true</OpenMPSupport>
<PreprocessorDefinitions>_MBCS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/Zc:twoPhase- /Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp17</LanguageStandard>
<LanguageStandard>Default</LanguageStandard>
<AdditionalIncludeDirectories>$(SolutionDir)\vcpkg\installed\x64-windows\include</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<Lib>
<AdditionalDependencies>$(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib</AdditionalDependencies>
</Lib>
<PostBuildEvent>
<Command>XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="inc\Core\Common\FineGrainedLock.h" />
Expand Down Expand Up @@ -160,6 +171,7 @@
<ClInclude Include="inc\Core\MetadataSet.h" />
<ClInclude Include="inc\Core\SearchQuery.h" />
<ClInclude Include="inc\Core\SearchResult.h" />
<ClInclude Include="inc\Core\SPANN\Compressor.h" />
<ClInclude Include="inc\Core\SPANN\ExtraFullGraphSearcher.h" />
<ClInclude Include="inc\Core\SPANN\IExtraSearcher.h" />
<ClInclude Include="inc\Core\SPANN\Index.h" />
Expand Down
3 changes: 3 additions & 0 deletions AnnService/CoreLibrary.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@
<ClInclude Include="inc\Core\Common\OPQQuantizer.h">
<Filter>Header Files\Core\Common</Filter>
</ClInclude>
<ClInclude Include="inc\Core\SPANN\Compressor.h">
<Filter>Header Files\Core\SPANN</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="src\Core\VectorIndex.cpp">
Expand Down
10 changes: 7 additions & 3 deletions AnnService/SSDServing.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -152,21 +152,25 @@
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>_$(OutputType);_MBCS;_SCL_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(SolutionDir)\vcpkg\installed\x64-windows\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PrecompiledHeaderFile>inc/SSDServing/Common/stdafx.h</PrecompiledHeaderFile>
<OpenMPSupport>true</OpenMPSupport>
<AdditionalOptions>/Zc:twoPhase- %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp17</LanguageStandard>
<LanguageStandard>Default</LanguageStandard>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>CoreLibrary.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CoreLibrary.lib;$(SolutionDir)\vcpkg\installed\x64-windows\lib\zstd.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<PostBuildEvent>
<Command>XCOPY $(SolutionDir)\vcpkg\installed\x64-windows\bin "$(TargetDir)" /D /K /Y</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
Expand Down
203 changes: 203 additions & 0 deletions AnnService/inc/Core/SPANN/Compressor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#ifndef _SPTAG_SPANN_COMPRESSOR_H_
#define _SPTAG_SPANN_COMPRESSOR_H_

#include <cstdlib>
#include <memory>
#include <string>
#include "zstd.h"
#include "zdict.h"
#include "../Common.h"

namespace SPTAG
{
namespace SPANN
{
class Compressor
{
private:
void CreateCDict()
{
cdict = ZSTD_createCDict((void *)dictBuffer.data(), dictBuffer.size(), compress_level);
if (cdict == NULL)
{
LOG(Helper::LogLevel::LL_Error, "ZSTD_createCDict() failed! \n");
exit(1);
}
}

void CreateDDict()
{
ddict = ZSTD_createDDict((void *)dictBuffer.data(), dictBuffer.size());
if (ddict == NULL)
{
LOG(Helper::LogLevel::LL_Error, "ZSTD_createDDict() failed! \n");
exit(1);
}
}

std::string CompressWithDict(const std::string &src)
{
size_t est_compress_size = ZSTD_compressBound(src.size());
std::string comp_buffer{};
comp_buffer.resize(est_compress_size);

ZSTD_CCtx *const cctx = ZSTD_createCCtx();
if (cctx == NULL)
{
LOG(Helper::LogLevel::LL_Error, "ZSTD_createCCtx() failed! \n");
exit(1);
}
size_t compressed_size = ZSTD_compress_usingCDict(cctx, (void *)comp_buffer.data(), est_compress_size, src.data(), src.size(), cdict);
if (ZSTD_isError(compressed_size))
{
LOG(Helper::LogLevel::LL_Error, "ZSTD compress error %s, \n", ZSTD_getErrorName(compressed_size));
exit(1);
}
ZSTD_freeCCtx(cctx);
comp_buffer.resize(compressed_size);
comp_buffer.shrink_to_fit();

return comp_buffer;
}

std::string DecompressWithDict(const char *src, size_t srcSize)
{
auto const est_decomp_size =
ZSTD_getFrameContentSize(src, srcSize);

std::string decomp_buffer{};
decomp_buffer.resize(est_decomp_size);

ZSTD_DCtx *const dctx = ZSTD_createDCtx();
if (dctx == NULL)
{
LOG(Helper::LogLevel::LL_Error, "ZSTD_createDCtx() failed! \n");
exit(1);
}
size_t const decomp_size = ZSTD_decompress_usingDDict(dctx,
(void *)decomp_buffer.data(), est_decomp_size, src, srcSize, ddict);
if (ZSTD_isError(decomp_size))
{
LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size));
exit(1);
suiguoxin marked this conversation as resolved.
Show resolved Hide resolved
}

ZSTD_freeDCtx(dctx);
decomp_buffer.resize(decomp_size);
decomp_buffer.shrink_to_fit();
return decomp_buffer;
}

std::string CompressWithoutDict(const std::string &src)
{
size_t est_comp_size = ZSTD_compressBound(src.size());
std::string buffer{};
buffer.resize(est_comp_size);
size_t compressed_size = ZSTD_compress((void *)buffer.data(), est_comp_size,
src.data(), src.size(), compress_level);
if (ZSTD_isError(compressed_size))
{
LOG(Helper::LogLevel::LL_Error, "ZSTD compress error %s, \n", ZSTD_getErrorName(compressed_size));
exit(1);
}
buffer.resize(compressed_size);
buffer.shrink_to_fit();

return buffer;
}

std::string DecompressWithoutDict(const char *src, size_t srcSize)
{
size_t est_decomp_size = ZSTD_getFrameContentSize(src, srcSize);
if (est_decomp_size == ZSTD_CONTENTSIZE_ERROR)
{
LOG(Helper::LogLevel::LL_Error, "not compressed by zstd!\n");
exit(1);
}
else if (est_decomp_size == ZSTD_CONTENTSIZE_UNKNOWN)
{
LOG(Helper::LogLevel::LL_Error, "original size unknown!\n");
exit(1);
}
std::string dst{};
dst.resize(est_decomp_size);
size_t const decomp_size = ZSTD_decompress(
(void *)dst.data(), est_decomp_size, src, srcSize);
if (ZSTD_isError(decomp_size))
{
LOG(Helper::LogLevel::LL_Error, "ZSTD decompress error %s, \n", ZSTD_getErrorName(decomp_size));
exit(1);
}
dst.resize(decomp_size);
dst.shrink_to_fit();

return dst;
}

public:
Compressor(int level = 0, int bufferCapacity = 102400)
{
compress_level = level;
dictBufferCapacity = bufferCapacity;
}

virtual ~Compressor() {}

std::size_t TrainDict(std::string samplesBuffer, const size_t *samplesSizes, unsigned nbSamples)
{
dictBuffer.resize(dictBufferCapacity);
size_t dictSize = ZDICT_trainFromBuffer((void *)dictBuffer.data(), dictBufferCapacity, (void *)samplesBuffer.data(), &samplesSizes[0], nbSamples);
if (ZDICT_isError(dictSize))
{
LOG(Helper::LogLevel::LL_Error, "ZDICT_trainFromBuffer() failed: %s \n", ZDICT_getErrorName(dictSize));
exit(1);
}
dictBuffer.resize(dictSize);
dictBuffer.shrink_to_fit();

CreateCDict();

return dictSize;
}

std::string GetDictBuffer()
{
return dictBuffer;
}

void SetDictBuffer(std::string buffer)
PhilipBAdams marked this conversation as resolved.
Show resolved Hide resolved
{
dictBuffer = buffer;
CreateDDict();
}

std::string Compress(const std::string &src, const bool useDict)
{
return useDict ? CompressWithDict(src) : CompressWithoutDict(src);
}

std::string Decompress(const char *src, size_t srcSize, const bool useDict)
suiguoxin marked this conversation as resolved.
Show resolved Hide resolved
{
return useDict ? DecompressWithDict(src, srcSize) : DecompressWithoutDict(src, srcSize);
}

// return the compressed sie
size_t GetCompressedSize(const std::string &src, bool useDict)
{
std::string dst = Compress(src, useDict);
return dst.size();
}

private:
int compress_level;

std::string dictBuffer;
size_t dictBufferCapacity;
ZSTD_CDict *cdict;
ZSTD_DDict *ddict;
};
} // SPANN
} // SPTAG

#endif // _SPTAG_SPANN_COMPRESSOR_H_