Skip to content

Commit

Permalink
update fresh ANN implementation (#85)
Browse files Browse the repository at this point in the history
* remove dup code

* Update Readme.md

* Fix DataSet GNU compile fail bug

* fix GNU Windows align alloc bugs

* add copyright in each file

* fix copyright in dataset

* change kdt distance judgement

* change code structure and add more wrappers

* Update docs

* fix search result

* change IndexBuilder to support binary input data

* temp remove java related projects

* remove javaclient and javacore from the windows build

* Fix SetData issue

* Add vector record count and dimension for reuse and debug

* change default parameter definition

* add uint8 support

* small fix for cosine distance of uint8

* fix AVX distance calculation epu8

* update readme

* Update DistanceUtils.h

* fix python wrapper cannot load larger than 4G memory error

* try to add C# wrapper

* fix owner of C# wrapper

* add C# cmake support

* fix byte array copy

* fix tab to space

* Try to make shared_ptr<T> as Array template

* fix copy

* add Parameters documents

* remove tbb dependency

* fix concurrent_set

* fix gcc 5.x cannot support shared_mutex

* move concurrentset to Helper folder and change find to contains

* Update README.md

* try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory

* fix filling -1

* fix initialization

* change to memset

* add CLR CoreInterface for managed dll

* try to reserve incBlocks capacity

* fix return ErrorCode for AddBatch in Dataset.h

* change return type to ErrorCode for AddBatch

* remove the tbb dependency (#71) (#10)

* remove dup code

* Update Readme.md

* Fix DataSet GNU compile fail bug

* fix GNU Windows align alloc bugs

* add copyright in each file

* fix copyright in dataset

* change kdt distance judgement

* change code structure and add more wrappers

* Update docs

* fix search result

* change IndexBuilder to support binary input data

* temp remove java related projects

* remove javaclient and javacore from the windows build

* Fix SetData issue

* Add vector record count and dimension for reuse and debug

* change default parameter definition

* add uint8 support

* small fix for cosine distance of uint8

* fix AVX distance calculation epu8

* update readme

* Update DistanceUtils.h

* fix python wrapper cannot load larger than 4G memory error

* try to add C# wrapper

* fix owner of C# wrapper

* add C# cmake support

* fix byte array copy

* fix tab to space

* Try to make shared_ptr<T> as Array template

* fix copy

* add Parameters documents

* remove tbb dependency

* fix concurrent_set

* fix gcc 5.x cannot support shared_mutex

* move concurrentset to Helper folder and change find to contains

* Update README.md

* try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory

* fix filling -1

* fix initialization

* change to memset

* add CLR CoreInterface for managed dll

* try to reserve incBlocks capacity

* fix return ErrorCode for AddBatch in Dataset.h

* change return type to ErrorCode for AddBatch

* fix type definition

* change incremental update design

* fix all type

* fix debug mode memory delete assert

* add deletePercentageForRefine judgement

* add dump and load from byte array

* add dump and load from byte array

* fix getNumThreads

* fix loadindex and add index bugs

* Update AlgoTest to add metamapping test

* fix compiling error in g++7

* fix largest cluster cannot be split during clustering
  • Loading branch information
MaggieQi committed Aug 9, 2019
1 parent 4b9cf58 commit b42efa0
Show file tree
Hide file tree
Showing 49 changed files with 1,929 additions and 1,510 deletions.
1 change: 1 addition & 0 deletions AnnService/CoreLibrary.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@
<ClInclude Include="inc\Core\VectorSet.h" />
<ClInclude Include="inc\Helper\ArgumentsParser.h" />
<ClInclude Include="inc\Helper\Base64Encode.h" />
<ClInclude Include="inc\Helper\BufferStream.h" />
<ClInclude Include="inc\Helper\CommonHelper.h" />
<ClInclude Include="inc\Helper\Concurrent.h" />
<ClInclude Include="inc\Helper\ConcurrentSet.h" />
Expand Down
3 changes: 3 additions & 0 deletions AnnService/CoreLibrary.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@
<ClInclude Include="inc\Helper\ConcurrentSet.h">
<Filter>Header Files\Helper</Filter>
</ClInclude>
<ClInclude Include="inc\Helper\BufferStream.h">
<Filter>Header Files\Helper</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="src\Core\VectorIndex.cpp">
Expand Down
60 changes: 40 additions & 20 deletions AnnService/inc/Core/BKT/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,35 +48,38 @@ namespace SPTAG
std::string m_sBKTFilename;
std::string m_sGraphFilename;
std::string m_sDataPointsFilename;
std::string m_sDeleteDataPointsFilename;

std::mutex m_dataAddLock; // protect data and graph
COMMON::ConcurrentSet<int> m_deletedID;
Helper::Concurrent::ConcurrentSet<SizeType> m_deletedID;
float m_fDeletePercentageForRefine;
std::unique_ptr<COMMON::WorkSpacePool> m_workSpacePool;

int m_iNumberOfThreads;
DistCalcMethod m_iDistCalcMethod;
float(*m_fComputeDistance)(const T* pX, const T* pY, int length);
float(*m_fComputeDistance)(const T* pX, const T* pY, DimensionType length);

int m_iMaxCheck;
int m_iThresholdOfNumberOfContinuousNoBetterPropagation;
int m_iNumberOfInitialDynamicPivots;
int m_iNumberOfOtherDynamicPivots;
public:
Index()
{
Index()
{
#define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \
VarName = DefaultValue; \

#include "inc/Core/BKT/ParameterDefinitionList.h"
#undef DefineBKTParameter

m_fComputeDistance = COMMON::DistanceCalcSelector<T>(m_iDistCalcMethod);
}
m_pSamples.SetName("Vector");
m_fComputeDistance = COMMON::DistanceCalcSelector<T>(m_iDistCalcMethod);
}

~Index() {}

inline int GetNumSamples() const { return m_pSamples.R(); }
inline int GetFeatureDim() const { return m_pSamples.C(); }
inline SizeType GetNumSamples() const { return m_pSamples.R(); }
inline DimensionType GetFeatureDim() const { return m_pSamples.C(); }

inline int GetCurrMaxCheck() const { return m_iMaxCheck; }
inline int GetNumThreads() const { return m_iNumberOfThreads; }
Expand All @@ -85,24 +88,41 @@ namespace SPTAG
inline VectorValueType GetVectorValueType() const { return GetEnumValueType<T>(); }

inline float ComputeDistance(const void* pX, const void* pY) const { return m_fComputeDistance((const T*)pX, (const T*)pY, m_pSamples.C()); }
inline const void* GetSample(const int idx) const { return (void*)m_pSamples[idx]; }

ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension);

ErrorCode LoadIndexFromMemory(const std::vector<void*>& p_indexBlobs);

ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout);
ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader);
inline const void* GetSample(const SizeType idx) const { return (void*)m_pSamples[idx]; }
inline bool ContainSample(const SizeType idx) const { return !m_deletedID.contains(idx); }
inline bool NeedRefine() const { return m_deletedID.size() >= (size_t)(GetNumSamples() * m_fDeletePercentageForRefine); }
/// Collects the buffer sizes reported by each component of the index.
///
/// @return A shared vector with four entries, in this order: the values of
///         m_pSamples.BufferSize(), m_pTrees.BufferSize(),
///         m_pGraph.BufferSize() and m_deletedID.bufferSize().
///         NOTE(review): presumably these are the byte counts needed to dump
///         each component to memory -- confirm against the callers.
std::shared_ptr<std::vector<std::uint64_t>> BufferSize() const
{
    // make_shared performs a single allocation and avoids a raw `new`.
    auto buffersize = std::make_shared<std::vector<std::uint64_t>>();
    buffersize->reserve(4); // exactly four components are reported below
    buffersize->push_back(m_pSamples.BufferSize());
    buffersize->push_back(m_pTrees.BufferSize());
    buffersize->push_back(m_pGraph.BufferSize());
    buffersize->push_back(m_deletedID.bufferSize());
    // Return the local by name: wrapping it in std::move would inhibit copy elision.
    return buffersize;
}

ErrorCode SaveConfig(std::ostream& p_configout) const;
ErrorCode SaveIndexData(const std::string& p_folderPath);
ErrorCode SaveIndexData(const std::vector<std::ostream*>& p_indexStreams);

ErrorCode LoadConfig(Helper::IniReader& p_reader);
ErrorCode LoadIndexData(const std::string& p_folderPath);
ErrorCode LoadIndexDataFromMemory(const std::vector<ByteArray>& p_indexBlobs);

ErrorCode BuildIndex(const void* p_data, SizeType p_vectorNum, DimensionType p_dimension);
ErrorCode SearchIndex(QueryResult &p_query) const;
ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension);
ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum);
ErrorCode AddIndex(const void* p_vectors, SizeType p_vectorNum, DimensionType p_dimension, SizeType* p_start = nullptr);
ErrorCode DeleteIndex(const void* p_vectors, SizeType p_vectorNum);
ErrorCode DeleteIndex(const SizeType& p_id);

ErrorCode SetParameter(const char* p_param, const char* p_value);
std::string GetParameter(const char* p_param) const;

private:
ErrorCode RefineIndex(const std::string& p_folderPath);
void SearchIndexWithDeleted(COMMON::QueryResultSet<T> &p_query, COMMON::WorkSpace &p_space, const COMMON::ConcurrentSet<int> &p_deleted) const;
ErrorCode RefineIndex(const std::vector<std::ostream*>& p_indexStreams);

private:
void SearchIndexWithDeleted(COMMON::QueryResultSet<T> &p_query, COMMON::WorkSpace &p_space, const Helper::Concurrent::ConcurrentSet<SizeType> &p_deleted) const;
void SearchIndexWithoutDeleted(COMMON::QueryResultSet<T> &p_query, COMMON::WorkSpace &p_space) const;
};
} // namespace BKT
Expand Down
4 changes: 3 additions & 1 deletion AnnService/inc/Core/BKT/ParameterDefinitionList.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DefineBKTParameter(m_sBKTFilename, std::string, std::string("tree.bin"), "TreeFilePath")
DefineBKTParameter(m_sGraphFilename, std::string, std::string("graph.bin"), "GraphFilePath")
DefineBKTParameter(m_sDataPointsFilename, std::string, std::string("vectors.bin"), "VectorFilePath")
DefineBKTParameter(m_sDeleteDataPointsFilename, std::string, std::string("deletes.bin"), "DeleteVectorFilePath")

DefineBKTParameter(m_pTrees.m_iTreeNumber, int, 1L, "BKTNumber")
DefineBKTParameter(m_pTrees.m_iBKTKmeansK, int, 32L, "BKTKmeansK")
Expand All @@ -18,7 +19,7 @@ DefineBKTParameter(m_pGraph.m_iTPTNumber, int, 32L, "TPTNumber")
DefineBKTParameter(m_pGraph.m_iTPTLeafSize, int, 2000L, "TPTLeafSize")
DefineBKTParameter(m_pGraph.m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTpTreeSplit")

DefineBKTParameter(m_pGraph.m_iNeighborhoodSize, int, 32L, "NeighborhoodSize")
DefineBKTParameter(m_pGraph.m_iNeighborhoodSize, DimensionType, 32L, "NeighborhoodSize")
DefineBKTParameter(m_pGraph.m_iNeighborhoodScale, int, 2L, "GraphNeighborhoodScale")
DefineBKTParameter(m_pGraph.m_iCEFScale, int, 2L, "GraphCEFScale")
DefineBKTParameter(m_pGraph.m_iRefineIter, int, 0L, "RefineIterations")
Expand All @@ -28,6 +29,7 @@ DefineBKTParameter(m_pGraph.m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckFor
DefineBKTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads")
DefineBKTParameter(m_iDistCalcMethod, SPTAG::DistCalcMethod, SPTAG::DistCalcMethod::Cosine, "DistCalcMethod")

DefineBKTParameter(m_fDeletePercentageForRefine, float, 0.4F, "DeletePercentageForRefine")
DefineBKTParameter(m_iMaxCheck, int, 8192L, "MaxCheck")
DefineBKTParameter(m_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation")
DefineBKTParameter(m_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots")
Expand Down
2 changes: 1 addition & 1 deletion AnnService/inc/Core/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ inline bool fileexists(const char* path) {

namespace SPTAG
{

typedef std::int32_t SizeType;
typedef std::int32_t DimensionType;

const SizeType MaxSize = (std::numeric_limits<SizeType>::max)();
const float MinDist = (std::numeric_limits<float>::min)();
Expand Down
Loading

0 comments on commit b42efa0

Please sign in to comment.