LSHSearch Parallelization #700

Merged: 26 commits merged on Jul 8, 2016
Changes from 2 commits

Commits (26)
72999dd
Implements parallel query processing for LSH
mentekid Jun 19, 2016
95d417f
Implements parallel query processing for LSH
mentekid Jun 19, 2016
afcc881
Book-keeping of number of threads used
mentekid Jun 19, 2016
5db5423
Adds parallelization to bichromatic search
mentekid Jun 19, 2016
abef504
Adds Bichromatic Parallel Test
mentekid Jun 19, 2016
4cbd43e
Merge branch 'master' into lsh-parallelization
mentekid Jun 24, 2016
a60ff91
Adds parallelism option to CMakeFiles, removes most omp.h dependence
mentekid Jun 27, 2016
6152527
Removes numThreadsUsed variable, changes how maxThreads is initialized
mentekid Jun 27, 2016
7cf77cd
Changes placeholder code for OpenMP
mentekid Jun 27, 2016
2ca48c6
Fixes CMakeFiles and code to make openMP actually transparent
mentekid Jun 27, 2016
a6aca41
CMake trickery to maybe make travis not crash
mentekid Jun 27, 2016
3d536c7
Simplifies OpenMP transparency code
mentekid Jun 27, 2016
c04b073
Adds -Wno-unknown-pragmas
mentekid Jun 27, 2016
65983d1
Commit to switch branch
mentekid Jun 28, 2016
b95a3ce
Changes BaseCase to include loop over candidate set. Changes loop var…
mentekid Jul 1, 2016
0d38271
Removes commented-out old BaseCase code
mentekid Jul 1, 2016
3af80c3
Merges multiprobe LSH
mentekid Jul 1, 2016
b02e2f3
Changes BaseCase to include the loop over candidate set. Places paral…
mentekid Jul 6, 2016
a1e9c28
Merges 3fe0b72
mentekid Jul 6, 2016
ad8e6d3
Modifies CMakeLists to remove -DHAS_OPENMP
mentekid Jul 7, 2016
c4c8ff9
Removes old code and comments from CMakeLists.txt
mentekid Jul 7, 2016
074d726
Restores size_t for openMP loop counters, changes CmakeLists to requi…
mentekid Jul 8, 2016
f982ca5
Removes maxThreads functionality from LSHSearch class
mentekid Jul 8, 2016
1fb998f
Removes HAS_OPENMP definition in CMakeFiles
mentekid Jul 8, 2016
b92d465
Workaround for OpenMP 2.0 (based on dt_utils.cpp)
mentekid Jul 8, 2016
2fee61e
Transforms omp loop to reduction
mentekid Jul 8, 2016
12 changes: 9 additions & 3 deletions CMakeLists.txt
@@ -18,15 +18,16 @@ endif()

# First, define all the compilation options.
# We default to debugging mode for developers.
option(DEBUG "Compile with debugging information" ON)
option(PROFILE "Compile with profiling information" ON)
option(DEBUG "Compile with debugging information." ON)
option(PROFILE "Compile with profiling information." ON)
Review comment (Member): Nice catch :)

Review comment (Member): Oops, this reverts 3fe0b72.

option(ARMA_EXTRA_DEBUG "Compile with extra Armadillo debugging symbols." OFF)
option(MATLAB_BINDINGS "Compile MATLAB bindings if MATLAB is found." OFF)
option(TEST_VERBOSE "Run test cases with verbose output." OFF)
option(BUILD_TESTS "Build tests." ON)
option(BUILD_CLI_EXECUTABLES "Build command-line executables" ON)
option(BUILD_SHARED_LIBS
"Compile shared libraries (if OFF, static libraries are compiled)" ON)
"Compile shared libraries (if OFF, static libraries are compiled)." ON)
option(HAS_OPENMP "Use OpenMP for parallel execution." OFF)

enable_testing()

@@ -117,6 +118,11 @@ if(ARMA_EXTRA_DEBUG)
add_definitions(-DARMA_EXTRA_DEBUG)
endif()

# If the user has an OpenMP-enabled compiler, turn OpenMP on
if (HAS_OPENMP)
add_definitions(-DHAS_OPENMP)
endif()

# Now, find the libraries we need to compile against. Several variables can be
# set to manually specify the directory in which each of these libraries
# resides.
5 changes: 5 additions & 0 deletions src/mlpack/core.hpp
@@ -232,6 +232,11 @@
#include <mlpack/core/kernels/spherical_kernel.hpp>
#include <mlpack/core/kernels/triangular_kernel.hpp>

// Use OpenMP if compiled with -DHAS_OPENMP.
#ifdef HAS_OPENMP
#include <omp.h>
#endif

// Use Armadillo's C++ version detection.
#ifdef ARMA_USE_CXX11
#define MLPACK_USE_CX11
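Editorial aside: the HAS_OPENMP guard added above keeps omp.h out of builds whose compiler lacks OpenMP. A related pattern that later commits in this PR gesture at ("make openMP actually transparent") is to provide no-op fallbacks so call sites do not need their own #ifdef blocks; the sketch below only illustrates that idea, it is not code from this diff, and the stub bodies are assumptions.

#ifdef HAS_OPENMP
  #include <omp.h>
#else
  // Fallback stubs (hypothetical): they deliberately mirror the OpenMP API so
  // code that queries thread state still compiles in single-threaded builds.
  inline int omp_get_max_threads() { return 1; }
  inline int omp_get_thread_num() { return 0; }
#endif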
2 changes: 1 addition & 1 deletion src/mlpack/methods/lsh/lsh_search.hpp
@@ -270,7 +270,7 @@ class LSHSearch
template<typename VecType>
void ReturnIndicesFromTable(const VecType& queryPoint,
arma::uvec& referenceIndices,
size_t numTablesToSearch) const;
size_t numTablesToSearch);

/**
* This is a helper function that computes the distance of the query to the
97 changes: 62 additions & 35 deletions src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -14,6 +14,16 @@ using std::cout; using std::endl; //TODO: remove
namespace mlpack {
namespace neighbor {

// Small helper that returns the number of available threads, or 1 if OpenMP is not in use.
inline size_t DefineMaxThreads()
{
#ifdef _OPENMP
return omp_get_max_threads();
#else
return 1;
#endif
}

// Construct the object with random tables
template<typename SortPolicy>
LSHSearch<SortPolicy>::
@@ -31,7 +41,7 @@ LSHSearch(const arma::mat& referenceSet,
secondHashSize(secondHashSize),
bucketSize(bucketSize),
distanceEvaluations(0),
maxThreads(omp_get_max_threads()),
maxThreads(DefineMaxThreads()),
numThreadsUsed(1)
{
// Pass work to training function.
@@ -344,7 +354,7 @@ template<typename VecType>
void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
const VecType& queryPoint,
arma::uvec& referenceIndices,
size_t numTablesToSearch) const
size_t numTablesToSearch)
{
// Decide on the number of tables to look into.
if (numTablesToSearch == 0) // If no user input is given, search all.
@@ -406,18 +416,37 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
arma::Col<size_t> refPointsConsidered;
refPointsConsidered.zeros(referenceSet->n_cols);

for (size_t i = 0; i < hashVec.n_elem; ++i)
// Define the number of threads used to process this.
size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);

// Parallelization: By default nested parallelism is off, so this won't be
// parallel. The user might turn nested parallelism on if (for example) they
// have a query-by-query processing scheme and so processing more than one
// query at the same time doesn't make sense for them.

#pragma omp parallel for \
num_threads (numThreadsUsed) \
shared (hashVec, refPointsConsidered) \
schedule(dynamic)
for (size_t i = 0; i < numTablesToSearch; ++i)
{

const size_t hashInd = (size_t) hashVec[i];
const size_t tableRow = bucketRowInHashTable[hashInd];

// Pick the indices in the bucket corresponding to 'hashInd'.
if (tableRow != secondHashSize)
{
for (size_t j = 0; j < bucketContentSize[tableRow]; j++)
{
#pragma omp atomic
refPointsConsidered[secondHashTable[tableRow](j)]++;
}
}
}

// Only keep reference points found in at least one bucket.
// TODO: maybe write parallel implementation of this?
referenceIndices = arma::find(refPointsConsidered > 0);
return;
}
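Aside (not part of the diff): the comment above only has an effect if the caller opts into nested parallelism, which OpenMP disables by default. A minimal sketch of that opt-in follows; the helper name EnableNestedParallelism is hypothetical, and exporting OMP_NESTED=TRUE in the environment is the equivalent knob.

#ifdef _OPENMP
#include <omp.h>
#endif

// Allow parallel regions nested inside other parallel regions, so the table
// loop above can still fan out when it is reached from an outer per-query
// parallel region.
inline void EnableNestedParallelism()
{
#ifdef _OPENMP
  omp_set_nested(1);
#endif
}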
@@ -431,18 +460,39 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(

// Retrieve candidates.
size_t start = 0;

// Define the number of threads used to process this.
size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);

// Parallelization: By default nested parallelism is off, so this won't be
// parallel. The user might turn nested parallelism on if (for example) they
// have a query-by-query processing scheme and so processing more than one
// query at the same time doesn't make sense for them.

#pragma omp parallel for \
num_threads (numThreadsUsed) \
shared (hashVec, refPointsConsideredSmall, start) \
schedule(dynamic)
for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables
{
const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket.
const size_t tableRow = bucketRowInHashTable[hashInd];

// Store all secondHashTable points in the candidates set.
if (tableRow != secondHashSize)
{
for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
{
#pragma omp critical
{
refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
}
}
}
}

// Only keep unique candidates.
// TODO: again main bottleneck is here. Parallelize?
referenceIndices = arma::unique(refPointsConsideredSmall);
return;
}
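Aside (illustrative, not from the PR): the two loops above synchronize differently. The first only increments one counter per candidate, so omp atomic is enough; this one must read and advance the shared start index and then write the slot as a single unit, which is why it uses an omp critical block. A self-contained sketch of that distinction, with hypothetical names:

#include <vector>

// 'items' values are assumed to lie in [0, maxValue].
std::vector<int> TallyAndFlatten(const std::vector<int>& items, const int maxValue)
{
  std::vector<int> counts(maxValue + 1, 0);  // Per-value tallies (atomic).
  std::vector<int> flat(items.size(), 0);    // Flattened copy (critical).
  int next = 0;

  #pragma omp parallel for
  for (long i = 0; i < (long) items.size(); ++i)
  {
    // A single scalar update: 'atomic' is sufficient and cheap.
    #pragma omp atomic
    counts[items[i]]++;

    // Two dependent operations (read/advance 'next', then the store) must not
    // interleave across threads, so the whole block is serialized.
    #pragma omp critical
    {
      flat[next++] = items[i];
    }
  }

  return flat;
}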
@@ -489,25 +539,16 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,

Timer::Start("computing_neighbors");

// Parallelization allows us to process more than one query at a time. To
// control workload and thread access, we use numThreadsUsed and maxThreads to
// make sure we only use as many threads as the user specified.
// Parallelization to process more than one query at a time.
// Use as many threads as possible, but not more than the allowed number.
size_t numThreadsUsed = std::min( (arma::uword) maxThreads, querySet.n_cols );
#pragma omp parallel for \
if (numThreadsUsed <= maxThreads) \
num_threads (maxThreads-numThreadsUsed)\
num_threads ( numThreadsUsed )\
shared(avgIndicesReturned, resultingNeighbors, distances) \
schedule(dynamic)
Review comment (Member):

Two questions:

  • Is the dynamic schedule the right one to use here? My understanding was that the dynamic schedule had more overhead. In this case it seems like the default static schedule would be just fine.
  • I think the num_threads() call here will effectively set the number of threads used to the value of the environment variable OMP_NUM_THREADS, but that would be the same as the default anyway if you hadn't set num_threads(). So it makes me think that the maxThreads member is unnecessary (and the other supporting functionality).

Reply (mentekid, Contributor, Author), Jul 8, 2016:
> Is the dynamic schedule the right one to use here? My understanding was that the dynamic schedule had more overhead. In this case it seems like the default static schedule would be just fine.

The problem with static scheduling is it doesn't leave room for work-stealing. Since queries get unequal sizes of candidate sets, in static scheduling some threads will finish their chunks quickly and then be useless. In dynamic scheduling, the runtime will detect slackers and give them more work to do.
I went with dynamic without trying static first, because of this. I can try that and see if there's a difference.

> I think the num_threads() call here will effectively set the number of threads used to the value of the environment variable OMP_NUM_THREADS, but that would be the same as the default anyway if you hadn't set num_threads(). So it makes me think that the maxThreads member is unnecessary (and the other supporting functionality).

Yes I think I can simplify the code more now that we're not doing nested parallelism.
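To make both points concrete, here is a hedged sketch (not the PR's actual loop) of a per-query parallel for with uneven work: the schedule clause is the only knob being compared, and no num_threads clause is given, so OMP_NUM_THREADS (or the implementation default) decides the thread count. All names are illustrative.

#include <cmath>
#include <vector>

// Stand-in for per-query work whose cost grows with the candidate-set size;
// the imbalance is what makes schedule(dynamic) attractive.
double ProcessQueries(const std::vector<int>& candidateSetSizes)
{
  double checksum = 0.0;

  // Swap schedule(dynamic) for schedule(static) to get the fixed, up-front
  // partitioning discussed above.
  #pragma omp parallel for schedule(dynamic) reduction(+:checksum)
  for (long i = 0; i < (long) candidateSetSizes.size(); ++i)
  {
    double work = 0.0;
    for (int j = 0; j < candidateSetSizes[i]; ++j)
      work += std::sqrt((double) j);
    checksum += work;
  }

  return checksum;
}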

Reply (mentekid, Contributor, Author):
About static vs dynamic scheduling, I ran some tests:

Dataset     dynamic              static
Sift100k    0.086 +/- 0.008 s    0.093 +/- 0.014 s
phy         3.35 +/- 0.124 s     3.46 +/- 0.203 s
Corel       0.228 +/- 0.03 s     0.234 +/- 0.03 s
Miniboone   0.711 +/- 0.07 s     0.701 +/- 0.09 s

In the first 3, I'd say dynamic is slightly faster. It's hard to tell for Miniboone because the standard deviation is much larger than the difference. I'll run covertype and pokerhand in a while when my PC is not used.


// Go through every query point.
for (size_t i = 0; i < querySet.n_cols; i++)
{
// Master thread updates the number of threads used
if (i == 0 && omp_get_thread_num() == 0)
{
numThreadsUsed+=omp_get_num_threads();
Log::Info
<< "Using "<< numThreadsUsed << " threads to process queries." << endl;
}

// Hash every query into every hash table and eventually into the
// 'secondHashTable' to obtain the neighbor candidates.
@@ -526,8 +567,6 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,
BaseCase(i, (size_t) refIndices[j], querySet, resultingNeighbors,
distances);
}
// parallel region over, reset number of threads to 1
numThreadsUsed = omp_get_num_threads();

Timer::Stop("computing_neighbors");
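Editorial aside: the final commit in the list above, "Transforms omp loop to reduction", points at the usual way to handle accumulators such as avgIndicesReturned without sharing them across threads. A minimal sketch of that pattern, with hypothetical names:

#include <vector>

// Each thread accumulates into a private copy of totalIndices; the private
// copies are summed when the loop finishes, so no atomic or critical section
// is needed on the hot path.
long CountReturnedIndices(const std::vector<std::vector<long>>& candidateSets)
{
  long totalIndices = 0;

  #pragma omp parallel for reduction(+:totalIndices) schedule(dynamic)
  for (long i = 0; i < (long) candidateSets.size(); ++i)
    totalIndices += (long) candidateSets[i].size();

  return totalIndices;
}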

@@ -556,24 +595,16 @@ Search(const size_t k,

Timer::Start("computing_neighbors");

// Parallelization allows us to process more than one query at a time. To
// control workload and thread access, we use numThreadsUsed and maxThreads to
// make sure we only use as many threads as the user specified.
// Parallelization to process more than one query at a time.
// Use as many threads as possible, but not more than the allowed number.
size_t numThreadsUsed = std::min( (arma::uword) maxThreads, referenceSet->n_cols );
#pragma omp parallel for \
if (numThreadsUsed <= maxThreads) \
num_threads (maxThreads-numThreadsUsed)\
num_threads ( numThreadsUsed )\
shared(avgIndicesReturned, resultingNeighbors, distances) \
schedule(dynamic)
// Go through every query point.
for (size_t i = 0; i < referenceSet->n_cols; i++)
{
// Master thread updates the number of threads used
if (i == 0 && omp_get_thread_num() == 0)
{
numThreadsUsed+=omp_get_num_threads();
Log::Info
<< "Using "<< numThreadsUsed << " threads to process queries." << endl;
}
// Hash every query into every hash table and eventually into the
// 'secondHashTable' to obtain the neighbor candidates.
arma::uvec refIndices;
@@ -592,10 +623,6 @@

}

// parallel region over, reset number of threads to 1
numThreadsUsed = omp_get_num_threads();


Timer::Stop("computing_neighbors");

distanceEvaluations += avgIndicesReturned;
4 changes: 2 additions & 2 deletions src/mlpack/tests/lsh_test.cpp
@@ -499,7 +499,7 @@ BOOST_AUTO_TEST_CASE(ParallelBichromatic)
lshTest.Search(qdata, k, sequentialNeighbors, distances);

// Require both have same results
double recall = ComputeRecall(sequentialNeighbors, parallelNeighbors);
double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
BOOST_REQUIRE_EQUAL(recall, 1);
}

@@ -533,7 +533,7 @@ BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
lshTest.Search(k, sequentialNeighbors, distances);

// Require both have same results
double recall = ComputeRecall(sequentialNeighbors, parallelNeighbors);
double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
BOOST_REQUIRE_EQUAL(recall, 1);
}
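Note on the helper used by both tests: ComputeRecall compares the neighbors found by the parallel run against the sequential baseline and returns a recall value, where 1 means the parallel results contain all of the sequential ones, which is why both tests require exactly 1. A simplified, element-wise stand-in (not mlpack's actual implementation) might look like this:

#include <armadillo>

// Fraction of (row, column) entries where the found neighbors match the
// baseline exactly; 1.0 means full agreement.
double MatchingFraction(const arma::Mat<size_t>& found,
                        const arma::Mat<size_t>& baseline)
{
  size_t matches = 0;
  for (size_t col = 0; col < baseline.n_cols; ++col)
    for (size_t row = 0; row < baseline.n_rows; ++row)
      if (found(row, col) == baseline(row, col))
        ++matches;

  return (double) matches / (double) baseline.n_elem;
}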
