diff --git a/HISTORY.md b/HISTORY.md
index cddb28a3e3c..de21f895281 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,15 @@
 ### mlpack 2.0.2
 ###### 2016-??-??
+  * LSHSearch::Projection(size_t), which returned a single projection matrix,
+    has been removed. In its place, LSHSearch::Projections() has been added,
+    which returns an arma::cube with each projection table in a slice (#663).
+
+  * A new constructor has been added to LSHSearch that creates objects using
+    projection tables provided in an arma::cube (#663).
+
+  * LSHSearch::Projections(arma::cube) has been added, which allows users to
+    change the projection tables of an LSHSearch object (#663).
+
   * Handle zero-variance dimensions in DET (#515).

   * Add MiniBatchSGD optimizer (src/mlpack/core/optimizers/minibatch_sgd/) and
diff --git a/src/mlpack/core/data/serialization_template_version.hpp b/src/mlpack/core/data/serialization_template_version.hpp
new file mode 100644
index 00000000000..8cf67dc121c
--- /dev/null
+++ b/src/mlpack/core/data/serialization_template_version.hpp
@@ -0,0 +1,37 @@
+/**
+ * @file serialization_template_version.hpp
+ * @author Ryan Curtin
+ *
+ * A better version of the BOOST_CLASS_VERSION() macro that supports templated
+ * classes.
+ */
+#ifndef MLPACK_CORE_DATA_SERIALIZATION_TEMPLATE_VERSION_HPP
+#define MLPACK_CORE_DATA_SERIALIZATION_TEMPLATE_VERSION_HPP
+
+/**
+ * Use this like BOOST_CLASS_VERSION(), but for templated classes. The first
+ * argument is the signature for the template. Here is an example for
+ * math::Range<T>:
+ *
+ * BOOST_TEMPLATE_CLASS_VERSION(template<typename T>, math::Range<T>, 1);
+ */
+#define BOOST_TEMPLATE_CLASS_VERSION(SIGNATURE, T, N) \
+namespace boost { \
+namespace serialization { \
+SIGNATURE \
+struct version<T> \
+{ \
+  typedef mpl::int_<N> type; \
+  typedef mpl::integral_c_tag tag; \
+  BOOST_STATIC_CONSTANT(int, value = version::type::value); \
+  BOOST_MPL_ASSERT(( \
+      boost::mpl::less< \
+          boost::mpl::int_<N>, \
+          boost::mpl::int_<256> \
+      > \
+  )); \
+}; \
+} \
+}
+
+#endif
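One caveat worth knowing when using this macro: the preprocessor splits macro arguments on top-level commas, so a signature such as template<typename A, typename B> cannot be passed directly. A minimal sketch with a hypothetical single-parameter class (ExampleModel is illustrative only, not part of mlpack; it assumes the Boost.Serialization and MPL headers pulled in by prereqs.hpp are available):

    // A hypothetical templated class whose serialization format has changed.
    template<typename MetricType>
    class ExampleModel
    {
      // ...
    };

    // Declare serialization version 2 for every instantiation of ExampleModel.
    BOOST_TEMPLATE_CLASS_VERSION(template<typename MetricType>,
        ExampleModel<MetricType>, 2);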
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 7505f29c0af..9c7c1d6438c 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -64,6 +64,31 @@ class LSHSearch
    *     the maximum number of points that can be hashed into single bucket.
    *     Default values are already provided here.
    */
+  LSHSearch(const arma::mat& referenceSet,
+            const arma::cube& projections,
+            const double hashWidth = 0.0,
+            const size_t secondHashSize = 99901,
+            const size_t bucketSize = 500);
+
+  /**
+   * This function initializes the LSH class. It builds the hash on the
+   * reference set using the provided projections. See the individual
+   * functions performing the hashing for details on how the hashing is done.
+   *
+   * @param referenceSet Set of reference points and the set of queries.
+   * @param projections Cube of projection tables. For a cube of size
+   *     (a, b, c), a is the reference set dimensionality, and we set
+   *     numProj = b and numTables = c.
+   * @param hashWidth The width of hash for every table. If 0 (the default) is
+   *     provided, then the hash width is automatically obtained by computing
+   *     the average pairwise distance of 25 pairs. This should be a
+   *     reasonable upper bound on the nearest-neighbor distance in general.
+   * @param secondHashSize The size of the second hash table. This should be a
+   *     large prime number.
+   * @param bucketSize The size of the bucket in the second hash table. This
+   *     is the maximum number of points that can be hashed into a single
+   *     bucket. Default values are already provided here.
+   */
   LSHSearch(const arma::mat& referenceSet,
             const size_t numProj,
             const size_t numTables,
@@ -83,15 +108,17 @@ class LSHSearch
   ~LSHSearch();

   /**
-   * Train the LSH model on the given dataset. This means building new hash
-   * tables.
+   * Train the LSH model on the given dataset. If a cube of projection tables
+   * is not provided, new random tables are built. Otherwise, the provided
+   * tables are used.
    */
   void Train(const arma::mat& referenceSet,
              const size_t numProj,
              const size_t numTables,
              const double hashWidth = 0.0,
              const size_t secondHashSize = 99901,
-             const size_t bucketSize = 500);
+             const size_t bucketSize = 500,
+             const arma::cube& projection = arma::zeros<arma::cube>(0, 0, 0));

   /**
    * Compute the nearest neighbors of the points in the given query set and
@@ -147,7 +176,7 @@ class LSHSearch
    * @param ar Archive to serialize to.
    */
   template<typename Archive>
-  void Serialize(Archive& ar, const unsigned int /* version */);
+  void Serialize(Archive& ar, const unsigned int version);

   //! Return the number of distance evaluations performed.
   size_t DistanceEvaluations() const { return distanceEvaluations; }
@@ -158,9 +187,7 @@ class LSHSearch
   const arma::mat& ReferenceSet() const { return *referenceSet; }

   //! Get the number of projections.
-  size_t NumProjections() const { return projections.size(); }
-  //! Get the projection matrix of the given table.
-  const arma::mat& Projection(const size_t i) const { return projections[i]; }
+  size_t NumProjections() const { return projections.n_slices; }

   //! Get the offsets 'b' for each of the projections. (One 'b' per column.)
   const arma::mat& Offsets() const { return offsets; }
@@ -174,6 +201,17 @@ class LSHSearch
   //! Get the second hash table.
   const arma::Mat<size_t>& SecondHashTable() const { return secondHashTable; }

+  //! Get the projection tables.
+  const arma::cube& Projections() { return projections; }
+
+  //! Change the projection tables (this retrains the object).
+  void Projections(const arma::cube& projTables)
+  {
+    // Simply call Train() with the given projection tables.
+    Train(*referenceSet, numProj, numTables, hashWidth, secondHashSize,
+        bucketSize, projTables);
+  }
+
 private:
   /**
    * This function builds a hash table with two levels of hashing as presented
@@ -188,7 +233,7 @@ class LSHSearch
    * are private members of this class, initialized during the class
    * initialization.
    */
-  void BuildHash();
+  void BuildHash(const arma::cube& projection);

   /**
    * This function takes a query and hashes it into each of the hash tables to
@@ -271,8 +316,8 @@ class LSHSearch
   //! The number of hash tables.
   size_t numTables;

-  //! The std::vector containing the projection matrix of each table.
-  std::vector<arma::mat> projections; // should be [numProj x dims] x numTables
+  //! The arma::cube containing the projection matrix of each table.
+  arma::cube projections; // should be [dims x numProj] x numTables slices

   //! The list of the offsets 'b' for each of the projection for each table.
   arma::mat offsets; // should be numProj x numTables
@@ -307,6 +352,10 @@ class LSHSearch
 } // namespace neighbor
 } // namespace mlpack

+//! Set the serialization version of the LSHSearch class.
+BOOST_TEMPLATE_CLASS_VERSION(template<typename SortPolicy>,
+    mlpack::neighbor::LSHSearch<SortPolicy>, 1);
+
 // Include implementation.
 #include "lsh_search_impl.hpp"
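A minimal usage sketch of the interface added above (the dataset and table sizes are made up; error handling is omitted):

    #include <mlpack/core.hpp>
    #include <mlpack/methods/lsh/lsh_search.hpp>

    using namespace mlpack::neighbor;

    int main()
    {
      arma::mat data(10, 1000, arma::fill::randu); // 1000 points, 10 dims.

      // Projection tables: dims x numProj x numTables (here 10 x 20 x 4).
      arma::cube tables(10, 20, 4, arma::fill::randn);

      // Build the model from the user-provided tables; hashWidth = 0.0 (the
      // default) means the width is estimated from average pairwise distances.
      LSHSearch<> lsh(data, tables);

      arma::Mat<size_t> neighbors;
      arma::mat distances;
      lsh.Search(5, neighbors, distances); // 5 approximate nearest neighbors.

      // Swap in new projection tables; this retrains the model in place.
      arma::cube newTables(10, 20, 4, arma::fill::randn);
      lsh.Projections(newTables);

      return 0;
    }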
#include "lsh_search_impl.hpp" diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp index b956ed27717..c0fc57e759b 100644 --- a/src/mlpack/methods/lsh/lsh_search_impl.hpp +++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp @@ -12,7 +12,7 @@ namespace mlpack { namespace neighbor { -// Construct the object. +// Construct the object with random tables template LSHSearch:: LSHSearch(const arma::mat& referenceSet, @@ -35,6 +35,28 @@ LSHSearch(const arma::mat& referenceSet, bucketSize); } +// Construct the object with given tables +template +LSHSearch:: +LSHSearch(const arma::mat& referenceSet, + const arma::cube& projections, + const double hashWidthIn, + const size_t secondHashSize, + const size_t bucketSize) : + referenceSet(NULL), // This will be set in Train(). + ownsSet(false), + numProj(projections.n_cols), + numTables(projections.n_slices), + hashWidth(hashWidthIn), + secondHashSize(secondHashSize), + bucketSize(bucketSize), + distanceEvaluations(0) +{ + // Pass work to training function + Train(referenceSet, numProj, numTables, hashWidthIn, secondHashSize, + bucketSize, projections); +} + // Empty constructor. template LSHSearch::LSHSearch() : @@ -65,7 +87,8 @@ void LSHSearch::Train(const arma::mat& referenceSet, const size_t numTables, const double hashWidthIn, const size_t secondHashSize, - const size_t bucketSize) + const size_t bucketSize, + const arma::cube &projection) { // Set new reference set. if (this->referenceSet && ownsSet) @@ -97,7 +120,152 @@ void LSHSearch::Train(const arma::mat& referenceSet, Log::Info << "Hash width chosen as: " << hashWidth << std::endl; - BuildHash(); + // Hash Building Procedure + // The first level hash for a single table outputs a 'numProj'-dimensional + // integer key for each point in the set -- (key, pointID) + // The key creation details are presented below + // + + // Step I: Prepare the second level hash. + + // Obtain the weights for the second hash. + secondHashWeights = arma::floor(arma::randu(numProj) * + (double) secondHashSize); + + // The 'secondHashTable' is initially an empty matrix of size + // ('secondHashSize' x 'bucketSize'). But by only filling the buckets + // as points land in them allows us to shrink the size of the + // 'secondHashTable' at the end of the hashing. + + // Fill the second hash table n = referenceSet.n_cols. This is because no + // point has index 'n' so the presence of this in the bucket denotes that + // there are no more points in this bucket. + secondHashTable.set_size(secondHashSize, bucketSize); + secondHashTable.fill(referenceSet.n_cols); + + // Keep track of the size of each bucket in the hash. At the end of hashing + // most buckets will be empty. + bucketContentSize.zeros(secondHashSize); + + // Instead of putting the points in the row corresponding to the bucket, we + // chose the next empty row and keep track of the row in which the bucket + // lies. This allows us to stack together and slice out the empty buckets at + // the end of the hashing. + bucketRowInHashTable.set_size(secondHashSize); + bucketRowInHashTable.fill(secondHashSize); + + // Keep track of number of non-empty rows in the 'secondHashTable'. + size_t numRowsInTable = 0; + + // Step II: The offsets for all projections in all tables. + // Since the 'offsets' are in [0, hashWidth], we obtain the 'offsets' + // as randu(numProj, numTables) * hashWidth. + offsets.randu(numProj, numTables); + offsets *= hashWidth; + + + + + // Step III: Obtain the 'numProj' projections for each table. 
+  projections.clear(); // Reset the projections cube.
+
+  if (projection.n_slices == 0) // Randomly generate the tables.
+  {
+    // For the L2 metric, 2-stable distributions are used, and the normal
+    // Z ~ N(0, 1) is a 2-stable distribution.
+
+    // 'numTables' random tables arranged in a cube.
+    projections.randn(referenceSet.n_rows, numProj, numTables);
+  }
+  else if (projection.n_slices == numTables) // User-defined tables.
+  {
+    projections = projection;
+  }
+  else // Invalid argument.
+  {
+    throw std::invalid_argument("number of projection tables provided must "
+        "be equal to numTables");
+  }
+
+  for (size_t i = 0; i < numTables; i++)
+  {
+    // Step IV: Create the 'numProj'-dimensional key for each point in each
+    // table.
+
+    // The following code performs the task of hashing each point to a
+    // 'numProj'-dimensional integer key. Hence you get a ('numProj' x
+    // 'referenceSet.n_cols') key matrix.
+    //
+    // For a single table, let the 'numProj' projections be denoted by
+    // 'proj_i' and the corresponding offset be 'offset_i'. Then the key of a
+    // single point is obtained as:
+    // key = { floor((<proj_i, point> + offset_i) / 'hashWidth') forall i }
+    arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
+                                       referenceSet.n_cols);
+    arma::mat hashMat = projections.slice(i).t() * referenceSet;
+    hashMat += offsetMat;
+    hashMat /= hashWidth;
+
+    // Step V: Putting the points in the 'secondHashTable' by hashing the key.
+    // Now we hash every (key, point ID) pair to its corresponding bucket.
+    arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
+
+    // This gives us the bucket for the corresponding point ID.
+    for (size_t j = 0; j < secondHashVec.n_elem; j++)
+      secondHashVec[j] = (double) ((size_t) secondHashVec[j] % secondHashSize);
+
+    Log::Assert(secondHashVec.n_elem == referenceSet.n_cols);
+
+    // Insert the point in the corresponding row to its bucket in the
+    // 'secondHashTable'.
+    for (size_t j = 0; j < secondHashVec.n_elem; j++)
+    {
+      // This is the bucket number.
+      size_t hashInd = (size_t) secondHashVec[j];
+      // The point ID is 'j'.
+
+      // If this is currently an empty bucket, start a new row and keep track
+      // of which row corresponds to the bucket.
+      if (bucketContentSize[hashInd] == 0)
+      {
+        // Start a new row for hash.
+        bucketRowInHashTable[hashInd] = numRowsInTable;
+        secondHashTable(numRowsInTable, 0) = j;
+
+        numRowsInTable++;
+      }
+      else
+      {
+        // If the bucket is already present in the 'secondHashTable', find the
+        // corresponding row and insert the point ID in this row unless the
+        // bucket is full, in which case, do nothing.
+        if (bucketContentSize[hashInd] < bucketSize)
+          secondHashTable(bucketRowInHashTable[hashInd],
+                          bucketContentSize[hashInd]) = j;
+      }
+
+      // Increment the count of the points in this bucket.
+      if (bucketContentSize[hashInd] < bucketSize)
+        bucketContentSize[hashInd]++;
+    } // Loop over all points in the reference set.
+  } // Loop over tables.
+
+  // Step VI: Condensing the 'secondHashTable'.
+  size_t maxBucketSize = 0;
+  for (size_t i = 0; i < bucketContentSize.n_elem; i++)
+    if (bucketContentSize[i] > maxBucketSize)
+      maxBucketSize = bucketContentSize[i];
+
+  Log::Info << "Final hash table size: (" << numRowsInTable << " x "
+            << maxBucketSize << ")" << std::endl;
+  secondHashTable.resize(numRowsInTable, maxBucketSize);
 }

 template<typename SortPolicy>
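The two hashing levels above boil down to two formulas: the first-level key of a point x in table i is floor((P_i^T x + b_i) / hashWidth), and its bucket is <key, secondHashWeights> mod secondHashSize. A self-contained toy sketch of one table (plain Armadillo, not mlpack code; all sizes and the seed are arbitrary):

    #include <armadillo>

    int main()
    {
      arma::arma_rng::set_seed(42);

      const size_t numProj = 2, secondHashSize = 99901;
      const double hashWidth = 4.0;

      arma::mat data(3, 5, arma::fill::randu);       // 5 points, 3 dimensions.
      arma::mat proj(3, numProj, arma::fill::randn); // One table's projections.
      arma::vec offsets(numProj, arma::fill::randu);
      offsets *= hashWidth;

      // First level: a numProj-dimensional integer key per point.
      arma::mat keys = arma::floor(
          (proj.t() * data + arma::repmat(offsets, 1, data.n_cols)) /
          hashWidth);

      // Second level: map each key to a bucket in [0, secondHashSize).
      arma::vec weights = arma::floor(
          arma::randu<arma::vec>(numProj) * (double) secondHashSize);
      arma::rowvec buckets = weights.t() * keys;
      for (size_t j = 0; j < buckets.n_elem; ++j)
        buckets[j] = (double) ((size_t) buckets[j] % secondHashSize);

      buckets.print("bucket of each point:");
      return 0;
    }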
@@ -205,7 +373,7 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
   // Compute the projection of the query in each table.
   arma::mat allProjInTables(numProj, numTablesToSearch);
   for (size_t i = 0; i < numTablesToSearch; i++)
-    allProjInTables.unsafe_col(i) = projections[i].t() * queryPoint;
+    allProjInTables.unsafe_col(i) = projections.slice(i).t() * queryPoint;
   allProjInTables += offsets.cols(0, numTablesToSearch - 1);
   allProjInTables /= hashWidth;

@@ -355,152 +524,14 @@ Search(const size_t k,
 }

 template<typename SortPolicy>
-void LSHSearch<SortPolicy>::BuildHash()
+void LSHSearch<SortPolicy>::BuildHash(const arma::cube& projection)
 {
-  // The first level hash for a single table outputs a 'numProj'-dimensional
-  // integer key for each point in the set -- (key, pointID)
-  // The key creation details are presented below
-  //
-  // The second level hash is performed by hashing the key to
-  // an integer in the range [0, 'secondHashSize').
-  //
-  // This is done by creating a weight vector 'secondHashWeights' of
-  // length 'numProj' with each entry an integer randomly chosen
-  // between [0, 'secondHashSize').
-  //
-  // Then the bucket for any key and its corresponding point is
-  // given by <key, 'secondHashWeights'> % 'secondHashSize'
-  // and the corresponding point ID is put into that bucket.
-
-  // Step I: Prepare the second level hash.
-
-  // Obtain the weights for the second hash.
-  secondHashWeights = arma::floor(arma::randu(numProj) *
-                                  (double) secondHashSize);
-
-  // The 'secondHashTable' is initially an empty matrix of size
-  // ('secondHashSize' x 'bucketSize'). But by only filling the buckets
-  // as points land in them allows us to shrink the size of the
-  // 'secondHashTable' at the end of the hashing.
-
-  // Fill the second hash table n = referenceSet.n_cols. This is because no
-  // point has index 'n' so the presence of this in the bucket denotes that
-  // there are no more points in this bucket.
-  secondHashTable.set_size(secondHashSize, bucketSize);
-  secondHashTable.fill(referenceSet->n_cols);
-
-  // Keep track of the size of each bucket in the hash. At the end of hashing
-  // most buckets will be empty.
-  bucketContentSize.zeros(secondHashSize);
-
-  // Instead of putting the points in the row corresponding to the bucket, we
-  // chose the next empty row and keep track of the row in which the bucket
-  // lies. This allows us to stack together and slice out the empty buckets at
-  // the end of the hashing.
-  bucketRowInHashTable.set_size(secondHashSize);
-  bucketRowInHashTable.fill(secondHashSize);
-
-  // Keep track of number of non-empty rows in the 'secondHashTable'.
-  size_t numRowsInTable = 0;
-
-  // Step II: The offsets for all projections in all tables.
-  // Since the 'offsets' are in [0, hashWidth], we obtain the 'offsets'
-  // as randu(numProj, numTables) * hashWidth.
-  offsets.randu(numProj, numTables);
-  offsets *= hashWidth;
-
-  // Step III: Create each hash table in the first level hash one by one and
-  // putting them directly into the 'secondHashTable' for memory efficiency.
-  projections.clear(); // Reset projections vector.
-  for (size_t i = 0; i < numTables; i++)
-  {
-    // Step IV: Obtain the 'numProj' projections for each table.
-
-    // For L2 metric, 2-stable distributions are used, and
-    // the normal Z ~ N(0, 1) is a 2-stable distribution.
-    arma::mat projMat;
-    projMat.randn(referenceSet->n_rows, numProj);
-
-    // Save the projection matrix for querying.
-    projections.push_back(projMat);
-
-    // Step V: create the 'numProj'-dimensional key for each point in each
-    // table.
-
-    // The following code performs the task of hashing each point to a
-    // 'numProj'-dimensional integer key. Hence you get a ('numProj' x
-    // 'referenceSet.n_cols') key matrix.
-    //
-    // For a single table, let the 'numProj' projections be denoted by
-    // 'proj_i' and the corresponding offset be 'offset_i'. Then the key of a
-    // single point is obtained as:
-    // key = { floor((<proj_i, point> + offset_i) / 'hashWidth') forall i }
-    arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
-                                       referenceSet->n_cols);
-    arma::mat hashMat = projMat.t() * (*referenceSet);
-    hashMat += offsetMat;
-    hashMat /= hashWidth;
-
-    // Step VI: Putting the points in the 'secondHashTable' by hashing the key.
-    // Now we hash every key, point ID to its corresponding bucket.
-    arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
-
-    // This gives us the bucket for the corresponding point ID.
-    for (size_t j = 0; j < secondHashVec.n_elem; j++)
-      secondHashVec[j] = (double)((size_t) secondHashVec[j] % secondHashSize);
-
-    Log::Assert(secondHashVec.n_elem == referenceSet->n_cols);
-
-    // Insert the point in the corresponding row to its bucket in the
-    // 'secondHashTable'.
-    for (size_t j = 0; j < secondHashVec.n_elem; j++)
-    {
-      // This is the bucket number.
-      size_t hashInd = (size_t) secondHashVec[j];
-      // The point ID is 'j'.
-
-      // If this is currently an empty bucket, start a new row keep track of
-      // which row corresponds to the bucket.
-      if (bucketContentSize[hashInd] == 0)
-      {
-        // Start a new row for hash.
-        bucketRowInHashTable[hashInd] = numRowsInTable;
-        secondHashTable(numRowsInTable, 0) = j;
-
-        numRowsInTable++;
-      }
-      else
-      {
-        // If bucket is already present in the 'secondHashTable', find the
-        // corresponding row and insert the point ID in this row unless the
-        // bucket is full, in which case, do nothing.
-        if (bucketContentSize[hashInd] < bucketSize)
-          secondHashTable(bucketRowInHashTable[hashInd],
-                          bucketContentSize[hashInd]) = j;
-      }
-
-      // Increment the count of the points in this bucket.
-      if (bucketContentSize[hashInd] < bucketSize)
-        bucketContentSize[hashInd]++;
-    } // Loop over all points in the reference set.
-  } // Loop over tables.
-
-  // Step VII: Condensing the 'secondHashTable'.
-  size_t maxBucketSize = 0;
-  for (size_t i = 0; i < bucketContentSize.n_elem; i++)
-    if (bucketContentSize[i] > maxBucketSize)
-      maxBucketSize = bucketContentSize[i];
-
-  Log::Info << "Final hash table size: (" << numRowsInTable << " x "
-            << maxBucketSize << ")" << std::endl;
-  secondHashTable.resize(numRowsInTable, maxBucketSize);
 }

 template<typename SortPolicy>
 template<typename Archive>
 void LSHSearch<SortPolicy>::Serialize(Archive& ar,
-                                      const unsigned int /* version */)
+                                      const unsigned int version)
 {
   using data::CreateNVP;

@@ -518,9 +549,24 @@ void LSHSearch<SortPolicy>::Serialize(Archive& ar,

   // Delete existing projections, if necessary.
   if (Archive::is_loading::value)
-    projections.clear();
+    projections.reset();
+
+  // Backward compatibility: older versions of LSHSearch stored the projection
+  // tables in a std::vector<arma::mat>.
+  if (version == 0)
+  {
+    std::vector<arma::mat> tmpProj;
+    ar & CreateNVP(tmpProj, "projections");
+
+    projections.set_size(tmpProj[0].n_rows, tmpProj[0].n_cols, tmpProj.size());
+    for (size_t i = 0; i < tmpProj.size(); ++i)
+      projections.slice(i) = tmpProj[i];
+  }
+  else
+  {
+    ar & CreateNVP(projections, "projections");
+  }

-  ar & CreateNVP(projections, "projections");
   ar & CreateNVP(offsets, "offsets");
   ar & CreateNVP(hashWidth, "hashWidth");
   ar & CreateNVP(secondHashSize, "secondHashSize");
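Archives written before this change carry class version 0 and store "projections" as a std::vector<arma::mat>; the version == 0 branch above converts them to the cube layout on load. Caller code that still holds one matrix per table can build the equivalent cube the same way — a sketch (MakeProjectionCube is a hypothetical helper, not part of mlpack; it assumes a non-empty input with equally-sized matrices):

    #include <armadillo>
    #include <vector>

    // Pack per-table (dims x numProj) projection matrices into the
    // (dims x numProj x numTables) cube expected by the new interface.
    arma::cube MakeProjectionCube(const std::vector<arma::mat>& oldTables)
    {
      arma::cube tables(oldTables[0].n_rows, oldTables[0].n_cols,
          oldTables.size());
      for (size_t i = 0; i < oldTables.size(); ++i)
        tables.slice(i) = oldTables[i];
      return tables;
    }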
diff --git a/src/mlpack/prereqs.hpp b/src/mlpack/prereqs.hpp
index 02828929210..3852a6b561c 100644
--- a/src/mlpack/prereqs.hpp
+++ b/src/mlpack/prereqs.hpp
@@ -65,6 +65,7 @@
 #define BOOST_PFTO
 #endif
 #include <boost/serialization/serialization.hpp>
+#include <mlpack/core/data/serialization_template_version.hpp>

 // Now include Armadillo through the special mlpack extensions.
 #include <mlpack/core/arma_extend/arma_extend.hpp>
diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp
index 9bddbc2c16d..1753e672d61 100644
--- a/src/mlpack/tests/serialization_test.cpp
+++ b/src/mlpack/tests/serialization_test.cpp
@@ -1210,8 +1210,8 @@ BOOST_AUTO_TEST_CASE(LSHTest)
   BOOST_REQUIRE_EQUAL(lsh.NumProjections(), binaryLsh.NumProjections());
   for (size_t i = 0; i < lsh.NumProjections(); ++i)
   {
-    CheckMatrices(lsh.Projection(i), xmlLsh.Projection(i),
-        textLsh.Projection(i), binaryLsh.Projection(i));
+    CheckMatrices(lsh.Projections().slice(i), xmlLsh.Projections().slice(i),
+        textLsh.Projections().slice(i), binaryLsh.Projections().slice(i));
   }

   CheckMatrices(lsh.ReferenceSet(), xmlLsh.ReferenceSet(),