New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
LSHSearch Parallelization #700
Changes from 2 commits
72999dd
95d417f
afcc881
5db5423
abef504
4cbd43e
a60ff91
6152527
7cf77cd
2ca48c6
a6aca41
3d536c7
c04b073
65983d1
b95a3ce
0d38271
3af80c3
b02e2f3
a1e9c28
ad8e6d3
c4c8ff9
074d726
f982ca5
1fb998f
b92d465
2fee61e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,15 +18,16 @@ endif() | |
|
||
# First, define all the compilation options. | ||
# We default to debugging mode for developers. | ||
option(DEBUG "Compile with debugging information" ON) | ||
option(PROFILE "Compile with profiling information" ON) | ||
option(DEBUG "Compile with debugging information." ON) | ||
option(PROFILE "Compile with profiling information." ON) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops, this reverts 3fe0b72. |
||
option(ARMA_EXTRA_DEBUG "Compile with extra Armadillo debugging symbols." OFF) | ||
option(MATLAB_BINDINGS "Compile MATLAB bindings if MATLAB is found." OFF) | ||
option(TEST_VERBOSE "Run test cases with verbose output." OFF) | ||
option(BUILD_TESTS "Build tests." ON) | ||
option(BUILD_CLI_EXECUTABLES "Build command-line executables" ON) | ||
option(BUILD_SHARED_LIBS | ||
"Compile shared libraries (if OFF, static libraries are compiled)" ON) | ||
"Compile shared libraries (if OFF, static libraries are compiled)." ON) | ||
option(HAS_OPENMP "Use OpenMP for parallel execution." OFF) | ||
|
||
enable_testing() | ||
|
||
|
@@ -117,6 +118,11 @@ if(ARMA_EXTRA_DEBUG) | |
add_definitions(-DARMA_EXTRA_DEBUG) | ||
endif() | ||
|
||
# If the user has an OpenMP-enabled compiler, turn OpenMP on | ||
if (HAS_OPENMP) | ||
add_definitions(-DHAS_OPENMP) | ||
endif() | ||
|
||
# Now, find the libraries we need to compile against. Several variables can be | ||
# set to manually specify the directory in which each of these libraries | ||
# resides. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,16 @@ using std::cout; using std::endl; //TODO: remove | |
namespace mlpack { | ||
namespace neighbor { | ||
|
||
// Helper that reports how many threads may be used for parallel work:
// the OpenMP thread limit when OpenMP is available, otherwise a single
// thread (so the code behaves sensibly in serial builds).
inline size_t DefineMaxThreads()
{
  size_t numThreads = 1;
#ifdef _OPENMP
  numThreads = omp_get_max_threads();
#endif
  return numThreads;
}
|
||
// Construct the object with random tables | ||
template<typename SortPolicy> | ||
LSHSearch<SortPolicy>:: | ||
|
@@ -31,7 +41,7 @@ LSHSearch(const arma::mat& referenceSet, | |
secondHashSize(secondHashSize), | ||
bucketSize(bucketSize), | ||
distanceEvaluations(0), | ||
maxThreads(omp_get_max_threads()), | ||
maxThreads(DefineMaxThreads()), | ||
numThreadsUsed(1) | ||
{ | ||
// Pass work to training function. | ||
|
@@ -344,7 +354,7 @@ template<typename VecType> | |
void LSHSearch<SortPolicy>::ReturnIndicesFromTable( | ||
const VecType& queryPoint, | ||
arma::uvec& referenceIndices, | ||
size_t numTablesToSearch) const | ||
size_t numTablesToSearch) | ||
{ | ||
// Decide on the number of tables to look into. | ||
if (numTablesToSearch == 0) // If no user input is given, search all. | ||
|
@@ -406,18 +416,37 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable( | |
arma::Col<size_t> refPointsConsidered; | ||
refPointsConsidered.zeros(referenceSet->n_cols); | ||
|
||
for (size_t i = 0; i < hashVec.n_elem; ++i) | ||
// Define the number of threads used to process this. | ||
size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch); | ||
|
||
// Parallelization: By default nested parallelism is off, so this won't be | ||
// parallel. The user might turn nested parallelism on if (for example) they | ||
// have a query-by-query processing scheme and so processing more than one | ||
// query at the same time doesn't make sense for them. | ||
|
||
#pragma omp parallel for \ | ||
num_threads (numThreadsUsed) \ | ||
shared (hashVec, refPointsConsidered) \ | ||
schedule(dynamic) | ||
for (size_t i = 0; i < numTablesToSearch; ++i) | ||
{ | ||
|
||
const size_t hashInd = (size_t) hashVec[i]; | ||
const size_t tableRow = bucketRowInHashTable[hashInd]; | ||
|
||
// Pick the indices in the bucket corresponding to 'hashInd'. | ||
if (tableRow != secondHashSize) | ||
{ | ||
for (size_t j = 0; j < bucketContentSize[tableRow]; j++) | ||
{ | ||
#pragma omp atomic | ||
refPointsConsidered[secondHashTable[tableRow](j)]++; | ||
} | ||
} | ||
} | ||
|
||
// Only keep reference points found in at least one bucket. | ||
// TODO: maybe write parallel implementation of this? | ||
referenceIndices = arma::find(refPointsConsidered > 0); | ||
return; | ||
} | ||
|
@@ -431,18 +460,39 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable( | |
|
||
// Retrieve candidates. | ||
size_t start = 0; | ||
|
||
// Define the number of threads used to process this. | ||
size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch); | ||
|
||
// Parallelization: By default nested parallelism is off, so this won't be | ||
// parallel. The user might turn nested parallelism on if (for example) they | ||
// have a query-by-query processing scheme and so processing more than one | ||
// query at the same time doesn't make sense for them. | ||
|
||
#pragma omp parallel for \ | ||
num_threads (numThreadsUsed) \ | ||
shared (hashVec, refPointsConsideredSmall, start) \ | ||
schedule(dynamic) | ||
for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables | ||
{ | ||
const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket. | ||
const size_t tableRow = bucketRowInHashTable[hashInd]; | ||
|
||
// Store all secondHashTable points in the candidates set. | ||
if (tableRow != secondHashSize) | ||
{ | ||
for (size_t j = 0; j < bucketContentSize[tableRow]; ++j) | ||
refPointsConsideredSmall(start++) = secondHashTable[tableRow][j]; | ||
{ | ||
#pragma omp critical | ||
{ | ||
refPointsConsideredSmall(start++) = secondHashTable[tableRow][j]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
// Only keep unique candidates. | ||
// TODO: again main bottleneck is here. Parallelize? | ||
referenceIndices = arma::unique(refPointsConsideredSmall); | ||
return; | ||
} | ||
|
@@ -489,25 +539,16 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet, | |
|
||
Timer::Start("computing_neighbors"); | ||
|
||
// Parallelization allows us to process more than one query at a time. To | ||
// control workload and thread access, we use numThreadsUsed and maxThreads to | ||
// make sure we only use as many threads as the user specified. | ||
// Parallelization to process more than one query at a time. | ||
// Use as many threads as possible, but never more than the allowed number. | ||
size_t numThreadsUsed = std::min( (arma::uword) maxThreads, querySet.n_cols ); | ||
#pragma omp parallel for \ | ||
if (numThreadsUsed <= maxThreads) \ | ||
num_threads (maxThreads-numThreadsUsed)\ | ||
num_threads ( numThreadsUsed )\ | ||
shared(avgIndicesReturned, resultingNeighbors, distances) \ | ||
schedule(dynamic) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Two questions---
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The problem with static scheduling is it doesn't leave room for work-stealing. Since queries get unequal sizes of candidate sets, in static scheduling some threads will finish their chunks quickly and then be useless. In dynamic scheduling, the OpenMP runtime will detect slackers and give them more work to do.
Yes I think I can simplify the code more now that we're not doing nested parallelism. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. About static vs dynamic scheduling, I ran some tests: Sift100k
phy
Corel
Miniboone
In the first 3, I'd say dynamic is slightly faster. It's hard to tell for Miniboone because the standard deviation is much larger than the difference. I'll run covertype and pokerhand in a while when my PC is not used. |
||
|
||
// Go through every query point. | ||
for (size_t i = 0; i < querySet.n_cols; i++) | ||
{ | ||
// Master thread updates the number of threads used | ||
if (i == 0 && omp_get_thread_num() == 0) | ||
{ | ||
numThreadsUsed+=omp_get_num_threads(); | ||
Log::Info | ||
<< "Using "<< numThreadsUsed << " threads to process queries." << endl; | ||
} | ||
|
||
// Hash every query into every hash table and eventually into the | ||
// 'secondHashTable' to obtain the neighbor candidates. | ||
|
@@ -526,8 +567,6 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet, | |
BaseCase(i, (size_t) refIndices[j], querySet, resultingNeighbors, | ||
distances); | ||
} | ||
// parallel region over, reset number of threads to 1 | ||
numThreadsUsed = omp_get_num_threads(); | ||
|
||
Timer::Stop("computing_neighbors"); | ||
|
||
|
@@ -556,24 +595,16 @@ Search(const size_t k, | |
|
||
Timer::Start("computing_neighbors"); | ||
|
||
// Parallelization allows us to process more than one query at a time. To | ||
// control workload and thread access, we use numThreadsUsed and maxThreads to | ||
// make sure we only use as many threads as the user specified. | ||
// Parallelization to process more than one query at a time. | ||
// Use as many threads as possible, but never more than the allowed number. | ||
size_t numThreadsUsed = std::min( (arma::uword) maxThreads, referenceSet->n_cols ); | ||
#pragma omp parallel for \ | ||
if (numThreadsUsed <= maxThreads) \ | ||
num_threads (maxThreads-numThreadsUsed)\ | ||
num_threads ( numThreadsUsed )\ | ||
shared(avgIndicesReturned, resultingNeighbors, distances) \ | ||
schedule(dynamic) | ||
// Go through every query point. | ||
for (size_t i = 0; i < referenceSet->n_cols; i++) | ||
{ | ||
// Master thread updates the number of threads used | ||
if (i == 0 && omp_get_thread_num() == 0) | ||
{ | ||
numThreadsUsed+=omp_get_num_threads(); | ||
Log::Info | ||
<< "Using "<< numThreadsUsed << " threads to process queries." << endl; | ||
} | ||
// Hash every query into every hash table and eventually into the | ||
// 'secondHashTable' to obtain the neighbor candidates. | ||
arma::uvec refIndices; | ||
|
@@ -592,10 +623,6 @@ Search(const size_t k, | |
|
||
} | ||
|
||
// parallel region over, reset number of threads to 1 | ||
numThreadsUsed = omp_get_num_threads(); | ||
|
||
|
||
Timer::Stop("computing_neighbors"); | ||
|
||
distanceEvaluations += avgIndicesReturned; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice catch :)