From 8381adc17ea07a73d3ce87e0f2b48a8baca18c15 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Wed, 27 Jul 2016 11:36:34 +0100 Subject: [PATCH 01/18] Skeleton of Project --- src/mlpack/methods/CMakeLists.txt | 1 + src/mlpack/methods/lsh/lsh_search.hpp | 1 + src/mlpack/methods/lsh_model/CMakeLists.txt | 19 +++++++ src/mlpack/methods/lsh_model/lshmodel.hpp | 55 +++++++++++++++++++ .../methods/lsh_model/lshmodel_impl.hpp | 1 + .../methods/lsh_model/lshmodel_main.cpp | 15 +++++ 6 files changed, 92 insertions(+) create mode 100644 src/mlpack/methods/lsh_model/CMakeLists.txt create mode 100644 src/mlpack/methods/lsh_model/lshmodel.hpp create mode 100644 src/mlpack/methods/lsh_model/lshmodel_impl.hpp create mode 100644 src/mlpack/methods/lsh_model/lshmodel_main.cpp diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt index dbbd2318bee..6a098340339 100644 --- a/src/mlpack/methods/CMakeLists.txt +++ b/src/mlpack/methods/CMakeLists.txt @@ -36,6 +36,7 @@ set(DIRS local_coordinate_coding logistic_regression lsh + lsh_model # mvu matrix_completion naive_bayes diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp index 4e6cc97b3d9..62aa64b8542 100644 --- a/src/mlpack/methods/lsh/lsh_search.hpp +++ b/src/mlpack/methods/lsh/lsh_search.hpp @@ -1,6 +1,7 @@ /** * @file lsh_search.hpp * @author Parikshit Ram + * @author Yannis Mentekidis * * Defines the LSHSearch class, which performs an approximate * nearest neighbor search for a queries in a query set diff --git a/src/mlpack/methods/lsh_model/CMakeLists.txt b/src/mlpack/methods/lsh_model/CMakeLists.txt new file mode 100644 index 00000000000..c3799753aec --- /dev/null +++ b/src/mlpack/methods/lsh_model/CMakeLists.txt @@ -0,0 +1,19 @@ +# Define the files we need to compile. +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + # LSH-model class + lshmodel.hpp + lshmodel_impl.hpp +) + +# Add directory name to sources. 
+set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). +set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) + +# The code that models LSH to return a set of parameters that works well. +add_cli_executable(lshmodel) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp new file mode 100644 index 00000000000..073f9576a6b --- /dev/null +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -0,0 +1,55 @@ +/** + * @file lshmodel.hpp + * @author Yannis Mentekidis + * + * Defines the LSHModel class, which models the Locality Sensitive Hashing + * algorithm. The model identifies parameter sets that produce satisfactory + * results while keeping execution time low. + * + * The model was proposed by Dong et al in the following paper. + * + * @code + * @article{Dong2008LSHModel, + * author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar, + * Moses and Li, Kai}, + * title = {{Modeling LSH for performance tuning}}, + * journal = {Proceeding of the 17th ACM conference on Information and + * knowledge mining - CIKM '08}, + * pages = {669}, + * url = {http://portal.acm.org/citation.cfm?doid=1458082.1458172}, + * year = {2008} + * } + * @endcode + * + * We use a different method to fit Gamma Distributions to pairwise distances. + * Instead of the MLE method proposed in the paper above, we use the mlpack + * class GammaDistribution, which implements fitting according to Thomas Minka's + * work. + * + * @code + * @techreport{minka2002estimating, + * title={Estimating a {G}amma distribution}, + * author={Minka, Thomas P.}, + * institution={Microsoft Research}, + * address={Cambridge, U.K.}, + * year={2002} + * } + * @endcode + */ + + +namespace mlpack { +namespace neighbor { + +class LSHModel +{ + public: + + + private: + + +}; // class LSHModel. 
+ +} // namespace neighbor. +} // namespace mlpack. diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp new file mode 100644 index 00000000000..a267c74ae20 --- /dev/null +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -0,0 +1 @@ +#include "lshmodel.hpp" diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp new file mode 100644 index 00000000000..e28d2644bbf --- /dev/null +++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp @@ -0,0 +1,15 @@ +#include + +#include "lshmodel.hpp" +using namespace mlpack; + +PROGRAM_INFO("LSH Model (TODO: Complete this)", ""); + +PARAM_STRING_IN("reference_file", "File containing the dataset", "r", ""); +PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m"); + +int main(int argc, char* argv[]) +{ + std::cout << "Hello!" << std::endl; + return 0; +} From aa6a123819944a88b1d501be33ce15a00feb966a Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Tue, 2 Aug 2016 14:44:08 +0100 Subject: [PATCH 02/18] Almost complete LSHModel skeleton --- src/mlpack/methods/lsh_model/lshmodel.hpp | 239 ++++++++++++++++- .../methods/lsh_model/lshmodel_impl.hpp | 242 ++++++++++++++++++ .../methods/lsh_model/lshmodel_main.cpp | 12 +- 3 files changed, 489 insertions(+), 4 deletions(-) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp index 073f9576a6b..c6ba93d4500 100644 --- a/src/mlpack/methods/lsh_model/lshmodel.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -5,15 +5,15 @@ * Defines the LSHModel class, which models the Locality Sensitive Hashing * algorithm. The model identifies parameter sets that produce satisfactory * results while keeping execution time low. - * + * * The model was proposed by Dong et al in the following paper. 
* * @code * @article{Dong2008LSHModel, - * author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar, + * author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar, * Moses and Li, Kai}, * title = {{Modeling LSH for performance tuning}}, - * journal = {Proceeding of the 17th ACM conference on Information and + * journal = {Proceeding of the 17th ACM conference on Information and * knowledge mining - CIKM '08}, * pages = {669}, * url = {http://portal.acm.org/citation.cfm?doid=1458082.1458172}, @@ -37,19 +37,252 @@ * @endcode */ +#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_HPP +#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_HPP + +// For returning LSHSearch objects. +#include +#include +#include namespace mlpack { namespace neighbor { +template class LSHModel { public: + //! Empty Constructor. Do nothing + LSHModel(){ /* Do nothing. */ }; + + /** Parameterized Constructor. This function initializes the object and + * trains it with the provided reference set. + * + * @param referenceSet The data that will be used as a reference set for LSH + * to run queries against. We will fit distributions based on this data + * and produce good parameters for it. + * @param minRecall The minimum recall we want to guarantee. The parameters + * we will estimate will try to keep average recall of LSH above this. + * Must be in [0, 1). + * @param sampleSize The percentage of the reference set to sample for the + * estimation. Naive all-kNN will be run on this sample, so if it is too + * big, training will be very slow. Must be in [0, 1) + * @param k The number of nearest neighbors wanted for each query. + */ + LSHModel( + const arma::mat &referenceSet, + const double minRecall, + const double sampleSize, + const size_t k); + + //! Destructor. If we own any memory, free it. + ~LSHModel(); + + /** + * Trains the LSHModel. 
Fits distributions using referenceSet and then looks + * for LSH parameters that would return recalls larger than minRecall in the + * lowest cost (selectivity) possible. + * + * The model can estimate good values for the parameters: + * * numProj: Number of projections per projection table. + * * numTables: Number of projection tables. + * * hashWidth: Hash width of the LSH hash. + * * numProbes: Number of probes for multiprobe LSH. + * + * Train stores the computed parameters in the LSHModel object's variables. + * + * @param referenceSet The data that will be used as a reference set for LSH + * to run queries against. We will fit distributions based on this data + * and produce good parameters for it. + * @param minRecall The minimum recall we want to guarantee. The parameters + * we will estimate will try to keep average recall of LSH above this. + * Must be in [0, 1). + * @param sampleSize The fraction of the reference set to sample for the + * estimation. Naive all-kNN will be run on this sample, so if it is too + * big, training will be very slow. Must be in (0, 1]. + * @param k The number of nearest neighbors wanted for each query. + */ + void Train( + const arma::mat &referenceSet, + const double minRecall, + const double sampleSize, + const size_t k); + + /** + * This function returns an LSHSearch object trained with the parameters + * calculated when the LSHModel was trained. + * If any of the parameters we trained for (numProj, numTables, hashWidth) + * are specified, we will not use the trained but the provided parameters. + * If these are left to default (0), the estimated parameters will be used. + * + * @param numProjIn The number of projections per table. + * @param numTablesIn The number of projection tables. + * @param hashWidthIn The first level hash width. + * @param secondHashSize The second level hash width. + * @param bucketSize The second level bucket size. 
+ */ + LSHSearch* LSHObject( + const size_t numProjIn = 0, + const size_t numTablesIn = 0, + const double hashWidthIn = 0.0, + const size_t secondHashSize = 99901, + const size_t bucketSize = 500); + + //! Return the number of projections calculated. + size_t NumProj(void) const { return numProj; }; + + //! Return the number of tables calculated. + size_t NumTables(void) const { return numTables; }; + + //! Return the calculated hash width. + double HashWidth(void) const { return hashWidth; }; + + //! Return the calculated number of probes. + double NumProbes(void) const { return numProbes; }; + + //! Return the reference set. + const arma::mat ReferenceSet(void) const {return *referenceSet; }; + + //! Serialize the LSHModel object. + template + void Serialize(Archive& ar); private: + /** + * This is a helper class that uses the function a * k^b * N^c for some + * parameters a, b, c that have been fit to either predict the arithmetic or + * geometric mean of the squared distance of a point to its k-nearest + * neighbor, given some dataset size N. + */ + class DistanceStatisticPredictor + { + public: + //! Empty constructor. + DistanceStatisticPredictor() { }; + + //! Construct with training set. + DistanceStatisticPredictor(const arma::Col& inputSize, + const arma::vec& statistic, + size_t k) + : k(k) + { Train(inputSize, statistic); }; + + //! Default destructor. + ~DistanceStatisticPredictor() { }; + + /** + * Function that fits the alpha, beta and gamma parameters. + * + * @param inputSize A vector of input sizes. The input variable of the + * regression. + * @param statistic A vector of responses - the value of the statistic for + * each given inputSize. + */ + void Train(const arma::Col& inputSize, const arma::vec& statistic); + + /** + * Evaluate the statistic for a given dataset size. + * + * @param N - a new input size for which to evaluate the expected + * statistic. 
+ */ + double Predict(size_t N) + { return alpha * std::pow(k, beta) * std::pow(N, gamma); }; + + //! Set the alpha parameter. + void Alpha(double a) { alpha = a; }; + + //! Get the alpha parameter. + double Alpha(void) { return alpha; }; + + //! Set the beta parameter. + void Beta(double b) { beta = b; }; + + //! Get the beta parameter. + double Beta(void) { return beta; }; + + //! Set the gamma parameter. + void Gamma(double c) { gamma = c; }; + + //! Get the gamma parameter. + double Gamma(void) { return gamma; }; + + //! Set the k parameter. + void K(double kIn) { k = kIn; }; + + //! Get the k parameter. + double K(void) { return k; }; + + private: + double alpha; + double beta; + double gamma; + double k; + }; + + //! Vector of DistanceStatisticPredictors for arithmetic mean. + std::vector aMeanPredictors; + + //! Vector of DistanceStatisticPredictors for geometric mean. + std::vector gMeanPredictors; + + /** + * Function that fits two DistanceStatisticPredictors for each k - one + * to predict arithmetic mean and one to preduct geometric mean. + * + * @param referenceSizes The number of reference points for each kNN search. + * @param Ek The arithmetic mean of the squared distances of a point and its + * k-nearest neighbor. One column per k. + * @param Gk The geometric mean of the squared distances of a point and its + * k-nearest neighbor. One column per k. + */ + void ApproximateKNNStatistics( + arma::Col referenceSizes, + arma::mat Ek, + arma::mat Gk); + + //! Flag that tracks if we own the reference set. + bool ownsSet; + + //! Flag that tracks if we own an LSHSearch object. + bool ownsLSHObject; + + //! Number of projections per table. + size_t numProj; + + //! Number of projection tables. + size_t numTables; + + //! First-level hash width. + double hashWidth; + + //! Number of probes for multiprobe LSH. + size_t numProbes; + + //! Reference dataset. + const arma::mat* referenceSet; + + //! Vector of LSHSearch objects. 
+ std::vector< LSHSearch > lshObjectVector; + + //! Statistic: average squared distance of points. + double meanDist; + + //! Statistic: logarithm of the average squared distance of points. + double logMeanDist; + + //! Statistic: average of logarithm of squared distances of points. + double meanLogDist; + }; // class LSHModel. } // namespace neighbor. } // namespace mlpack. + +// Include the class implementation. +#include "lshmodel_impl.hpp" + +#endif diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index a267c74ae20..1df2ac0c3ad 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -1 +1,243 @@ +/** + * @file lshmodel_impl.hpp + * @author Yannis Mentekidis + * + * Implementation of the LSHModel functions. + */ +#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP +#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP + #include "lshmodel.hpp" + + +//TODO: remove +using std::cout; +using std::flush; +using std::endl; + +namespace mlpack { +namespace neighbor { + +// Constructor sets variables and trains the object. +template +LSHModel::LSHModel(const arma::mat &referenceSet, + const double minRecall, + const double sampleSize, + const size_t k) +{ + // We don't own the set - we just point to it. + ownsSet = false; + this->referenceSet = &referenceSet; + + Train(referenceSet, minRecall, sampleSize, k); +} + +// Destructor must de-allocate any referenceSet and LSHSearch objects we own. +template +LSHModel::~LSHModel() +{ + if (ownsSet) + delete referenceSet; +}; + +// Trains the object. +template +void LSHModel::Train(const arma::mat &referenceSet, + const double minRecall, + const double sampleSize, + const size_t k) +{ + // TODO: Implement + + // Sanity Check: Verify that recall is in [0, 1) and sampleSize in (0, 1]. 
+ if (minRecall >= 1 || minRecall < 0) + throw std::runtime_error("Recall must be floating point number in [0, 1)"); + + if (sampleSize > 1 || sampleSize <= 0) + throw std::runtime_error( + "Sampling rate must be floating point number in (0, 1]"); + + const size_t numPoints = referenceSet.n_cols; // Points in original set. + + // Step 1. Select a random sample of the dataset. We will work with only that + // sample. + arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu); + + // Keep a sample of the dataset. Shuffle to be impartial (in case reference + // set is sorted). + arma::mat sampleSet = arma::shuffle(referenceSet.cols( + // We have uniformly random numbers in [0, 1], so we expect about + // N*sampleSize of them to be in [0, sampleSize). + arma::find(sampleHelper < sampleSize) + )); + const size_t numSamples = sampleSet.n_cols; // Points in sampled set. + + Log::Info << "Sampled " << numSamples << " points to train with." << std::endl; + + // Step 2. Compute all-vs-all distances of points in the sample. + // The distance matrix is symmetric, so we only compute elements above the + // diagonal. There are (N * (N - 1)) / 2 such elements. + Timer::Start("pairwise_distances"); + arma::vec distances(numSamples * (numSamples - 1) / 2); + size_t d = 0; // Index of where to store next. + for (size_t i = 0; i < numSamples; ++i) + for (size_t j = i + 1; j < numSamples; ++j) + distances(d++) = metric::EuclideanDistance::Evaluate( + sampleSet.unsafe_col(i), sampleSet.unsafe_col(j)); + Log::Info << "Computed " << d << " pointwise distances." << std::endl; + Timer::Stop("pairwise_distances"); + + // Step 3. Estimate statistics of these distances: log(mean(d)), mean(log(d)), + // mean(d). + distances = arma::pow(distances, 2); + meanDist = arma::mean(distances); + logMeanDist = std::log(meanDist); + meanLogDist = arma::mean(arma::log(distances)); + + // Step 4. Select a small part of the sample as 'anchor points'. 
Use the rest + // of the sample as the reference set. Find the k-Nearest Neighbors' distances + // from the anchor points for increasing portions of the reference set. Compute + // the arithmetic and geometric mean of distances from each anchor to its + // k-Nearest Neighbor. + // The geometric mean of N numbers is the Nth root of the product of the + // numbers. Through logarithmic properties though, this becomes computable + // through exponentiating the mean of the logarithms of x: + // exp(mean(log(x))) = geometricmean(x). + + // Number of samples to create for modeling the Gamma Distributions + size_t regressionExamples = 50; // TODO: parameter? + // Number of points to use as queries. + size_t numAnchors = (size_t) std::round(0.1 * numSamples); + arma::mat queryMat = sampleSet.cols(0, numAnchors - 1); + // Evenly spaced sample sizes. + arma::Col referenceSizes = arma::conv_to< arma::Col >::from( + arma::linspace(numAnchors, numSamples - numAnchors - 1, + regressionExamples)); + + // Statistics - Arithmetic and geometric means for growing reference set. + // Compute one of each for each k. + arma::mat Ek(regressionExamples, k); + arma::mat Gk(regressionExamples, k); + + Timer::Start("neighbors_distances"); + // For each referenceSize, calculate the kNN of the anchors + for (size_t i = 0; i < regressionExamples; ++i) + { + // TODO: Since we've already computed this, avoid calling kNN? + // Reference set for kNN + arma::mat refMat = sampleSet.cols(numAnchors, numAnchors + referenceSizes(i) ); + + arma::Mat neighbors; // Not going to be used but required. + arma::mat kNNDistances; // What we need. + KNN naive(refMat, true); // true: train and use naive kNN. + naive.Search(queryMat, k, neighbors, kNNDistances); + kNNDistances = arma::pow(kNNDistances, 2); + + // Compute Arithmetic and Geometric mean of the distances. 
+ Ek.row(i) = arma::mean(kNNDistances.t()); + Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0)); + } + Timer::Stop("neighbors_distances"); + + // Step 5. Model the arithmetic and geometric mean according to the paper. + // This will produce 6 parameters (aE, bE, cE, aG, bG, cG) for each value of k + // from 1 to the k specified by the user. + ApproximateKNNStatistics(referenceSizes, Ek, Gk); + + // Step 6. Fit Gamma distributions to pairwise distances and kNN distances, + // generated or estimated in steps 3 and 5. + + // Step 7. Run Binary search on parameter space to minimize selectivity while + // keeping recall above minimum. +} + +// Fit two predictors for each k. +template +void LSHModel::ApproximateKNNStatistics( + arma::Col referenceSizes, + arma::mat Ek, + arma::mat Gk) +{ + size_t k = Ek.n_cols; + + // Clear vectors and set them to correct size. + aMeanPredictors.clear(); + gMeanPredictors.clear(); + aMeanPredictors.resize(k); + gMeanPredictors.resize(k); + + // Fit two predictors per value of k. + for (size_t i = 0; i < k; ++i) + { + aMeanPredictors[i] = DistanceStatisticPredictor( + referenceSizes, Ek.col(i), i); + gMeanPredictors[i] = DistanceStatisticPredictor( + referenceSizes, Gk.col(i), i); + } +} + +// Construct and return an LSH object. +template +LSHSearch* LSHModel::LSHObject(const size_t numProjIn, + const size_t numTablesIn, + const double hashWidthIn, + const size_t secondHashSize, + const size_t bucketSize) +{ + // Values for the object to be created with (specified by user or default). + size_t numProjOut = numProjIn; + size_t numTablesOut = numTablesIn; + double hashWidthOut = hashWidthIn; + + // If not specified by user, set these to the ones we trained for. + if (numProjIn == 0) + numProjOut = this->numProj; + + if (numTablesIn == 0) + numTablesOut = this->numTables; + + if (hashWidthOut == 0.0) + hashWidthOut = this->hashWidth; + + std::cout << *referenceSet; + + //TODO This causes a bad_alloc... 
I'm doing something wrong with the + //referenceSet. + /* + // Construct an object and return it. + LSHSearch<>* lshObject = new LSHSearch<>( + *referenceSet, numProjOut, numTablesOut, hashWidthOut, + secondHashSize, bucketSize); + return lshObject; + */ + + LSHSearch<>* lshObject = new LSHSearch<>(); + return lshObject; + + +} + +// Fit a curve to the data provided. +template +void LSHModel::DistanceStatisticPredictor::Train( + const arma::Col& inputSize, + const arma::vec& statistic) +{ + Log::Warn << "Not implemented yet! " << std::endl; + + alpha = beta = gamma = 1; + beta++; + alpha+=2; +} + +// Serialize the object and save to a file. +template +template +void LSHModel::Serialize(Archive& ar) +{ + //TODO: implement this. +} +} // Namespace neighbor. +} // Namespace mlpack. + +#endif diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp index e28d2644bbf..d5253a9a7e9 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp +++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp @@ -1,7 +1,10 @@ #include +#include #include "lshmodel.hpp" + using namespace mlpack; +using namespace mlpack::neighbor; PROGRAM_INFO("LSH Model (TODO: Complete this)", ""); @@ -10,6 +13,13 @@ PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m"); int main(int argc, char* argv[]) { - std::cout << "Hello!" << std::endl; + CLI::ParseCommandLine(argc, argv); + + // Generate a random point set. 
+ size_t N = 5000; + size_t d = 10; + arma::mat A(d, N, arma::fill::randu); + LSHModel<> model(A, 0.7, 0.25, 2); + return 0; } From 8d7cb38ba8369e838891215f168a94f3cd7789bd Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Thu, 4 Aug 2016 16:02:22 +0100 Subject: [PATCH 03/18] Implements modeling of arithmetic and geometric mean of distances --- src/mlpack/methods/lsh_model/lshmodel.hpp | 67 ++++--- .../methods/lsh_model/lshmodel_impl.hpp | 98 +++++----- .../methods/lsh_model/objectivefunction.hpp | 171 ++++++++++++++++++ 3 files changed, 265 insertions(+), 71 deletions(-) create mode 100644 src/mlpack/methods/lsh_model/objectivefunction.hpp diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp index c6ba93d4500..55b4ae03479 100644 --- a/src/mlpack/methods/lsh_model/lshmodel.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -42,13 +42,21 @@ // For returning LSHSearch objects. #include +// For template parameters and kNN search (if nescessary). #include #include +// For curve fitting. +#include +// Default objective function. +#include "objectivefunction.hpp" namespace mlpack { namespace neighbor { -template +template < + typename SortPolicy = NearestNeighborSort, + typename ObjectiveFunction = DefaultObjectiveFunction + > class LSHModel { public: @@ -154,7 +162,7 @@ class LSHModel * This is a helper class that uses the function a * k^b * N^c for some * parameters a, b, c that have been fit to either predict the arithmetic or * geometric mean of the squared distance of a point to its k-nearest - * neighbor, given some dataset size N. + * neighbor, given some dataset size N and its k-nearest neighbor. */ class DistanceStatisticPredictor { @@ -162,12 +170,20 @@ class LSHModel //! Empty constructor. DistanceStatisticPredictor() { }; - //! Construct with training set. + /** + * Function to construct with training set. + * + * @param inputSize A vector of input sizes. The first input variable of + * the regression. 
+ * @param kValues A vector of k values. The second input variable of the + * regression. + * @param statistic A vector of responses - the value of the statistic for + * each given inputSize. + */ DistanceStatisticPredictor(const arma::Col& inputSize, - const arma::vec& statistic, - size_t k) - : k(k) - { Train(inputSize, statistic); }; + const arma::Col& kValues, + const arma::mat& statistic) + { Train(inputSize, kValues, statistic); }; //! Default destructor. ~DistanceStatisticPredictor() { }; @@ -175,12 +191,16 @@ class LSHModel /** * Function that fits the alpha, beta and gamma parameters. * - * @param inputSize A vector of input sizes. The input variable of the + * @param inputSize A vector of input sizes. The first input variable of + * the regression. + * @param kValues A vector of k values. The second input variable of the * regression. * @param statistic A vector of responses - the value of the statistic for * each given inputSize. */ - void Train(const arma::Col& inputSize, const arma::vec& statistic); + double Train(const arma::Col& inputSize, + const arma::Col& kValues, + const arma::mat& statistic); /** * Evaluate the statistic for a given dataset size. @@ -188,7 +208,7 @@ class LSHModel * @param N - a new input size for which to evaluate the expected * statistic. */ - double Predict(size_t N) + double Predict(size_t N, size_t k) { return alpha * std::pow(k, beta) * std::pow(N, gamma); }; //! Set the alpha parameter. @@ -209,39 +229,36 @@ class LSHModel //! Get the gamma parameter. double Gamma(void) { return gamma; }; - //! Set the k parameter. - void K(double kIn) { k = kIn; }; - - //! Get the k parameter. - double K(void) { return k; }; private: double alpha; double beta; double gamma; - double k; }; - //! Vector of DistanceStatisticPredictors for arithmetic mean. - std::vector aMeanPredictors; + //! DistanceStatisticPredictor for arithmetic mean. + DistanceStatisticPredictor aMeanPredictor; - //! 
Vector of DistanceStatisticPredictors for geometric mean. - std::vector gMeanPredictors; + //! DistanceStatisticPredictor for geometric mean. + DistanceStatisticPredictor gMeanPredictor; /** - * Function that fits two DistanceStatisticPredictors for each k - one + * Function that fits two DistanceStatisticPredictors - one * to predict arithmetic mean and one to preduct geometric mean. * * @param referenceSizes The number of reference points for each kNN search. + * @param kValues The rank of the neighbors used for the statistic, for + * example k = 5 means Ek is the arithmetic mean of the 5th-nearest + * neighbor for different sample sizes. * @param Ek The arithmetic mean of the squared distances of a point and its * k-nearest neighbor. One column per k. * @param Gk The geometric mean of the squared distances of a point and its * k-nearest neighbor. One column per k. */ - void ApproximateKNNStatistics( - arma::Col referenceSizes, - arma::mat Ek, - arma::mat Gk); + void ApproximateKNNStatistics(const arma::Col& referenceSizes, + const arma::Col& kValues, + const arma::mat& Ek, + const arma::mat& Gk); //! Flag that tracks if we own the reference set. bool ownsSet; diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index 1df2ac0c3ad..3fdc04a73bb 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -19,8 +19,8 @@ namespace mlpack { namespace neighbor { // Constructor sets variables and trains the object. -template -LSHModel::LSHModel(const arma::mat &referenceSet, +template +LSHModel::LSHModel(const arma::mat &referenceSet, const double minRecall, const double sampleSize, const size_t k) @@ -33,16 +33,16 @@ LSHModel::LSHModel(const arma::mat &referenceSet, } // Destructor must de-allocate any referenceSet and LSHSearch objects we own. 
-template -LSHModel::~LSHModel() +template +LSHModel::~LSHModel() { if (ownsSet) delete referenceSet; }; // Trains the object. -template -void LSHModel::Train(const arma::mat &referenceSet, +template +void LSHModel::Train(const arma::mat &referenceSet, const double minRecall, const double sampleSize, const size_t k) @@ -106,6 +106,7 @@ void LSHModel::Train(const arma::mat &referenceSet, // Number of samples to create for modeling the Gamma Distributions size_t regressionExamples = 50; // TODO: parameter? + // Number of points to use as queries. size_t numAnchors = (size_t) std::round(0.1 * numSamples); arma::mat queryMat = sampleSet.cols(0, numAnchors - 1); @@ -140,9 +141,10 @@ void LSHModel::Train(const arma::mat &referenceSet, Timer::Stop("neighbors_distances"); // Step 5. Model the arithmetic and geometric mean according to the paper. - // This will produce 6 parameters (aE, bE, cE, aG, bG, cG) for each value of k - // from 1 to the k specified by the user. - ApproximateKNNStatistics(referenceSizes, Ek, Gk); + // This will produce 6 parameters (aE, bE, cE, aG, bG, cG). + // Vector of k values. + arma::Col kValues = arma::linspace>(1, k, k); + ApproximateKNNStatistics(referenceSizes, kValues, Ek, Gk); // Step 6. Fit Gamma distributions to pairwise distances and kNN distances, // generated or estimated in steps 3 and 5. @@ -151,38 +153,30 @@ void LSHModel::Train(const arma::mat &referenceSet, // keeping recall above minimum. } -// Fit two predictors for each k. -template -void LSHModel::ApproximateKNNStatistics( - arma::Col referenceSizes, - arma::mat Ek, - arma::mat Gk) +// Fit two predictors, one for arithmetic mean E and one for geometric mean G. +template +void LSHModel:: +ApproximateKNNStatistics(const arma::Col& referenceSizes, + const arma::Col& kValues, + const arma::mat& Ek, + const arma::mat& Gk) { - size_t k = Ek.n_cols; - - // Clear vectors and set them to correct size. 
- aMeanPredictors.clear(); - gMeanPredictors.clear(); - aMeanPredictors.resize(k); - gMeanPredictors.resize(k); - - // Fit two predictors per value of k. - for (size_t i = 0; i < k; ++i) - { - aMeanPredictors[i] = DistanceStatisticPredictor( - referenceSizes, Ek.col(i), i); - gMeanPredictors[i] = DistanceStatisticPredictor( - referenceSizes, Gk.col(i), i); - } + double aError = aMeanPredictor.Train(referenceSizes, kValues, Ek); + Log::Info << "L_BFGS Converged for arithmetic mean with error " + << aError << "." << std::endl; + double gError = gMeanPredictor.Train(referenceSizes, kValues, Gk); + Log::Info << "L_BFGS Converged for geometric mean with error " + << gError << "." << std::endl; } // Construct and return an LSH object. -template -LSHSearch* LSHModel::LSHObject(const size_t numProjIn, - const size_t numTablesIn, - const double hashWidthIn, - const size_t secondHashSize, - const size_t bucketSize) +template +LSHSearch* LSHModel:: +LSHObject(const size_t numProjIn, + const size_t numTablesIn, + const double hashWidthIn, + const size_t secondHashSize, + const size_t bucketSize) { // Values for the object to be created with (specified by user or default). size_t numProjOut = numProjIn; @@ -218,22 +212,34 @@ LSHSearch* LSHModel::LSHObject(const size_t numProjIn, } // Fit a curve to the data provided. -template -void LSHModel::DistanceStatisticPredictor::Train( +template +double LSHModel::DistanceStatisticPredictor::Train( const arma::Col& inputSize, - const arma::vec& statistic) + const arma::Col& kValues, + const arma::mat& statistic) { - Log::Warn << "Not implemented yet! " << std::endl; + // Objective function for fitting the E(x, k) curve to the statistic. + ObjectiveFunction f(inputSize, kValues, statistic); + + // Optimizer. Use L_BFGS (TODO: Make this a template parameter?) + mlpack::optimization::L_BFGS opt(f); + + // Get an initial point from the optimizer. 
+ arma::mat currentPoint = f.GetInitialPoint(); + double result = opt.Optimize(currentPoint); + + // Optimizer is done - set alpha, beta, gamma. + this->alpha = currentPoint(0, 0); + this->beta = currentPoint(1, 0); + this->gamma = currentPoint(2, 0); - alpha = beta = gamma = 1; - beta++; - alpha+=2; + return result; } // Serialize the object and save to a file. -template +template template -void LSHModel::Serialize(Archive& ar) +void LSHModel::Serialize(Archive& ar) { //TODO: implement this. } diff --git a/src/mlpack/methods/lsh_model/objectivefunction.hpp b/src/mlpack/methods/lsh_model/objectivefunction.hpp new file mode 100644 index 00000000000..2aa60ab8054 --- /dev/null +++ b/src/mlpack/methods/lsh_model/objectivefunction.hpp @@ -0,0 +1,171 @@ +/** + * @file objectivefunction.hpp + * @author Yannis Mentekidis + * + * This file implements a class that describes an objective function for + * minimization. It is used by the LSH model to fit a curve of the form + * E(k, N) = \alpha \cdot k ^ \beta \cdot N^\gamma + * to a certain statistic E, which can be either the arithmetic or the geometric + * mean of distances of a random point and its k-Nearest Neighbors. + * + * The objective function to minimize is the mean squared error (MSE): + * + * Error =\sum_{i=0}^{M} (y(i) - \alpha \cdot k ^ \beta \cdot N^\gamma)^2 + * + * The class is designed for use with the L_BFGS optimizer, which is what the + * lshmodel class uses. + */ + +#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_DEFAULT_OBJECTIVE_FUNCTION_HPP +#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_DEFAULT_OBJECTIVE_FUNCTION_HPP + +namespace mlpack { +namespace neighbor { + +class DefaultObjectiveFunction +{ + public: + //! Default constructor - do nothing. + DefaultObjectiveFunction() { }; + + /** + * Parameterized constructor. + * + * @param xData Vector of x - the sizes of the reference set when performing + * kNN. + * @param kData Vector of k - the kth nearest neighbor for which we + * calculated the statistic. 
+ * @param yData Matrix of y, one for each (x, k) value. + */ + DefaultObjectiveFunction(const arma::Col& xData, + const arma::Col& kData, + const arma::mat& yData) + : xData(&xData), kData(&kData), yData(&yData) { }; + + //! Return the number of functions + size_t NumFunctions(void) const { return yData->n_elem; } + + //! Return a random starting point. + arma::mat GetInitialPoint() const + { return arma::mat(3, 1, arma::fill::randu); } + + /** + * This function evaluates the objective (MSE) at some coordinates with + * some index. + * Called by the optimizer. + * + * @param coordinates Input matrix of coordinates. + */ + double Evaluate(const arma::mat& coordinates) const; + + /** + * This function evaluates the gradient at some coordinates with some index. + * Called by the optimizer. + * + * @param coordinates Input matrix of coordinates. + * @param gradient Output matrix of gradients for each dimension of the + * surface + */ + void Gradient(const arma::mat& coordinates, + arma::mat& gradient) const; + + private: + //! Data points for x-axis. + const arma::Col* xData; + //! Data points for k-axis. + const arma::Col* kData; + //! Data points for y-axis. + const arma::mat* yData; +}; + +/** + * Returns the value of the objective function for some coordinates (alpha, + * beta, gamma). + * This is the mean squared error for the current parameters or coordinates. + */ +double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const +{ + // Use extra variables to make code readable. + double alpha = coordinates(0, 0); + double beta = coordinates(1, 0); + double gamma = coordinates(2, 0); + double M = (double) NumFunctions(); + + // Sum the squared error for each element in yData. + double sum = 0; + for (size_t i = 0; i < yData->n_elem; ++i) + { + // Map i to (row, col). Columnwise access of yData. + size_t row = i % yData->n_rows; + size_t col = (size_t) (i / yData->n_rows); // Integer division (floor). + + // Get the corresponding values. 
+ size_t x = (*xData)(row); + size_t k = (*kData)(col); + double y = (*yData)(row, col); + + // Evaluate (y - a * k ^ b * x ^ c)^2 for the given (x, y) pair. + sum += pow(y - alpha * std::pow(k, beta) * std::pow(x, gamma), 2); + } + + // Return the mean of the squared errors. + return sum / M; +} + +/** + * Stores the gradient of the objective function in gradient. This is the + * derivative with respect to (alpha, beta, gamma) evaluated at the current + * parameters. + */ +void DefaultObjectiveFunction::Gradient(const arma::mat& coordinates, + arma::mat& gradient) const +{ + // Use extra variables to make code readable. + double alpha = coordinates(0, 0); + double beta = coordinates(1, 0); + double gamma = coordinates(2, 0); + double M = (double) NumFunctions(); + + // Allocate 3x1 matrix for gradient. Set all gradients to 0. + gradient.set_size(3, 1); + gradient.zeros(3,1); + + // Sum each gradient. + for (size_t i = 0; i < yData->n_elem; ++i) + { + size_t row = i % yData->n_rows; + size_t col = (size_t) (i / yData->n_rows); // Integer division. + size_t x = (*xData)(row); + size_t k = (*kData)(col); + double y = (*yData)(row, col); + + // The error for these parameters. Precompute for efficiency. + double error = (y - alpha * std::pow(k, beta) * std::pow(x, gamma)); + + // The chain rule factor of the product, for each gradient dimension. + double alphaChain = + - 2.0 * std::pow(k, beta) * std::pow(x, gamma); + + double betaChain = + - 2.0 * alpha * std::pow(x, gamma) * std::log(k) * std::pow(k, beta); + + double gammaChain = + - 2.0 * alpha * std::pow(k, beta) * std::log(x) * std::pow(x, gamma); + + // 3x1 column vector (in matrix form). + gradient(0, 0) += error * alphaChain; + gradient(1, 0) += error * betaChain; + gradient(2, 0) += error * gammaChain; + } + + // Return the average of each gradient after the summation is complete. 
+ gradient(0, 0) /= ((double) M); + gradient(1, 0) /= ((double) M); + gradient(2, 0) /= ((double) M); +} + +} // namespace neighbor +} // namespace mlpack + +#endif + From 29b6eaadfca44102f23edd889012f0ef57dbd762 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Thu, 4 Aug 2016 17:22:32 +0100 Subject: [PATCH 04/18] Adds code that returns trained LSH objects --- src/mlpack/methods/lsh_model/lshmodel.hpp | 6 +++--- src/mlpack/methods/lsh_model/lshmodel_impl.hpp | 18 ++++-------------- src/mlpack/methods/lsh_model/lshmodel_main.cpp | 10 ++++++++-- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp index 55b4ae03479..e0d8299317c 100644 --- a/src/mlpack/methods/lsh_model/lshmodel.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -130,7 +130,7 @@ class LSHModel * @param secondHashSize The second level hash width. * @param bucketSize The second level bucket size. */ - LSHSearch* LSHObject( + LSHSearch LSHObject( const size_t numProjIn = 0, const size_t numTablesIn = 0, const double hashWidthIn = 0.0, @@ -281,8 +281,8 @@ class LSHModel //! Reference dataset. const arma::mat* referenceSet; - //! Vector of LSHSearch objects. - std::vector< LSHSearch > lshObjectVector; + //! LSHSearch Object Vector. + std::vector> lshObjectVector; //! Statistic: average squared distance of points. double meanDist; diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index 3fdc04a73bb..3d21843cabb 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -171,7 +171,7 @@ ApproximateKNNStatistics(const arma::Col& referenceSizes, // Construct and return an LSH object. 
template -LSHSearch* LSHModel:: +LSHSearch LSHModel:: LSHObject(const size_t numProjIn, const size_t numTablesIn, const double hashWidthIn, @@ -193,22 +193,12 @@ LSHObject(const size_t numProjIn, if (hashWidthOut == 0.0) hashWidthOut = this->hashWidth; - std::cout << *referenceSet; - - //TODO This causes a bad_alloc... I'm doing something wrong with the - //referenceSet. - /* - // Construct an object and return it. - LSHSearch<>* lshObject = new LSHSearch<>( - *referenceSet, numProjOut, numTablesOut, hashWidthOut, + LSHSearch<> lsh(*referenceSet, numProjOut, numTablesOut, hashWidthOut, secondHashSize, bucketSize); - return lshObject; - */ - - LSHSearch<>* lshObject = new LSHSearch<>(); - return lshObject; + lshObjectVector.push_back(lsh); + return lshObjectVector[lshObjectVector.size() - 1]; } // Fit a curve to the data provided. diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp index d5253a9a7e9..7192ce8ff96 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp +++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp @@ -18,8 +18,14 @@ int main(int argc, char* argv[]) // Generate a random point set. size_t N = 5000; size_t d = 10; - arma::mat A(d, N, arma::fill::randu); - LSHModel<> model(A, 0.7, 0.25, 2); + arma::mat rdata(d, N, arma::fill::randu); + LSHModel<> model(rdata, 0.7, 0.25, 2); + + arma::mat qdata(d, 1, arma::fill::randu); + arma::Mat neighbors; + arma::mat distances; + LSHSearch<> lsh = model.LSHObject(1, 1, 1.0, 99901, 500); + lsh.Search(qdata, 1, neighbors, distances); return 0; } From dd8f5f37263ac83414128599ded4a2939439317c Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Fri, 5 Aug 2016 11:14:12 +0100 Subject: [PATCH 05/18] attempt to fix arma::shuffle call causes travis build failure. 
--- src/mlpack/methods/lsh_model/lshmodel_impl.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index 3d21843cabb..b400d1cc228 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -63,13 +63,12 @@ void LSHModel::Train(const arma::mat &referenceSe // sample. arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu); - // Keep a sample of the dataset. Shuffle to be impartial (in case reference - // set is sorted). - arma::mat sampleSet = arma::shuffle(referenceSet.cols( - // We have uniformly random numbers in [0, 1], so we expect about - // N*sampleSize of them to be in [0, sampleSize). - arma::find(sampleHelper < sampleSize) - )); + // Keep a sample of the dataset: We have uniformly random numbers in [0, 1], + // so we expect about N*sampleSize of them to be in [0, sampleSize). + arma::mat sampleSet = referenceSet.cols( + arma::find(sampleHelper < sampleSize)); + // Shuffle to be impartial (in case dataset is sorted in some way). + sampleSet = arma::shuffle(sampleSet); const size_t numSamples = sampleSet.n_cols; // Points in sampled set. Log::Info << "Sampled " << numSamples << " points to train with." 
<< std::endl; From 009a4acf3ee417dbae2eb2de4ea91a3caa696818 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Fri, 5 Aug 2016 15:22:07 +0100 Subject: [PATCH 06/18] Completes Train(), adds Predict() --- src/mlpack/methods/lsh_model/lshmodel.hpp | 101 +++++++++-------- .../methods/lsh_model/lshmodel_impl.hpp | 103 +++++++++++++----- .../methods/lsh_model/lshmodel_main.cpp | 6 +- 3 files changed, 137 insertions(+), 73 deletions(-) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp index e0d8299317c..2890b0a91ce 100644 --- a/src/mlpack/methods/lsh_model/lshmodel.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -49,6 +49,8 @@ #include // Default objective function. #include "objectivefunction.hpp" +// Gamma distribution for modeling squared distances. +#include namespace mlpack { namespace neighbor { @@ -62,7 +64,7 @@ class LSHModel public: //! Empty Constructor. Do nothing - LSHModel(){ /* Do nothing. */ }; + LSHModel(){ referenceSet = NULL; }; /** Parameterized Constructor. This function initializes the object and * trains it with the provided reference set. @@ -70,9 +72,6 @@ class LSHModel * @param referenceSet The data that will be used as a reference set for LSH * to run queries against. We will fit distributions based on this data * and produce good parameters for it. - * @param minRecall The minimum recall we want to guarantee. The parameters - * we will estimate will try to keep average recall of LSH above this. - * Must be in [0, 1). * @param sampleSize The percentage of the reference set to sample for the * estimation. Naive all-kNN will be run on this sample, so if it is too * big, training will be very slow. Must be in [0, 1) @@ -80,7 +79,6 @@ class LSHModel */ LSHModel( const arma::mat &referenceSet, - const double minRecall, const double sampleSize, const size_t k); @@ -88,34 +86,44 @@ class LSHModel ~LSHModel(); /** - * Trains the LSHModel. 
Fits distributions using referenceSet and then looks - * for LSH parameters that would return recalls larger than minRecall in the - * lowest cost (selectivity) possible. + * Trains the LSHModel. Train() uses a sample that is sampleRate * |N| to + * estimate parameters of the dataset. The estimated parameters are: + * * Arithmetic mean of pairwise distances of random points in the sample. + * * Geometric mean for the pairwise distnaces + * * Arithmetic mean of distance random point to its k-th nearest neighbor + * as a function of |N|, the number of points. + * * Geometric mean of the same distance. * - * The model can estimate good values for the parameters: - * * numProj: Number of projections per projection table. - * * numTables: Number of projection tables. - * * hashWidth: Hash width of the LSH hash. - * * numProbes: Number of probes for multiprobe LSH. - * - * Train stores the computed parameters in the LSHModel object's variables. + * Train() does not find LSH Parameters - it only estimates the dataset + * parameters. You have to call Predict() to find LSH Parameters. * * @param referenceSet The data that will be used as a reference set for LSH * to run queries against. We will fit distributions based on this data * and produce good parameters for it. - * @param minRecall The minimum recall we want to guarantee. The parameters - * we will estimate will try to keep average recall of LSH above this. - * Must be in [0, 1). - * @param sampleSize The percentage of the reference set to sample for the + * @param sampleRate The percentage of the reference set to sample for the * estimation. Naive all-kNN will be run on this sample, so if it is too * big, training will be very slow. Must be in [0, 1) - * @param k The number of nearest neighbors wanted for each query. + * @param maxKValue The maximum number of nearest neighbors for each query to + * train for. 
*/ - void Train( - const arma::mat &referenceSet, - const double minRecall, - const double sampleSize, - const size_t k); + void Train(const arma::mat& referenceSet, + const double sampleRate = 0.1, + const size_t maxKValue = 32); + + /** + * Predict() finds LSH parameters that should work well for the dataset the + * LSHModel was trained for. + * Warning: If the k specified is larger than the maxKValue passed to + * Train(), Train() will be called again. This might have adverse effects to + * performance. + * + * @param datasetSize The size of the dataset that will be used. + * @param k The number of k-nearest neighbors LSH must find. + * @param minRecall The minimum acceptable recall we want to tune for. + */ + void Predict(const size_t datasetSize, + const size_t k, + const double minRecall); /** * This function returns an LSHSearch object trained with the parameters @@ -157,6 +165,23 @@ class LSHModel void Serialize(Archive& ar); private: + /** + * Function that fits two DistanceStatisticPredictors - one + * to predict arithmetic mean and one to preduct geometric mean. + * + * @param referenceSizes The number of reference points for each kNN search. + * @param kValues The rank of the neighbors used for the statistic, for + * example k = 5 means Ek is the arithmetic mean of the 5th-nearest + * neighbor for different sample sizes. + * @param Ek The arithmetic mean of the squared distances of a point and its + * k-nearest neighbor. One column per k. + * @param Gk The geometric mean of the squared distances of a point and its + * k-nearest neighbor. One column per k. + */ + void ApproximateKNNStatistics(const arma::Col& referenceSizes, + const arma::Col& kValues, + const arma::mat& Ek, + const arma::mat& Gk); /** * This is a helper class that uses the function a * k^b * N^c for some @@ -229,7 +254,6 @@ class LSHModel //! Get the gamma parameter. double Gamma(void) { return gamma; }; - private: double alpha; double beta; @@ -241,30 +265,15 @@ class LSHModel //! 
DistanceStatisticPredictor for geometric mean. DistanceStatisticPredictor gMeanPredictor; - - /** - * Function that fits two DistanceStatisticPredictors - one - * to predict arithmetic mean and one to preduct geometric mean. - * - * @param referenceSizes The number of reference points for each kNN search. - * @param kValues The rank of the neighbors used for the statistic, for - * example k = 5 means Ek is the arithmetic mean of the 5th-nearest - * neighbor for different sample sizes. - * @param Ek The arithmetic mean of the squared distances of a point and its - * k-nearest neighbor. One column per k. - * @param Gk The geometric mean of the squared distances of a point and its - * k-nearest neighbor. One column per k. - */ - void ApproximateKNNStatistics(const arma::Col& referenceSizes, - const arma::Col& kValues, - const arma::mat& Ek, - const arma::mat& Gk); + + //! (k+1)-dimensional gamma distribution for predicting squared distances. + mlpack::distribution::GammaDistribution distancesDistribution; //! Flag that tracks if we own the reference set. bool ownsSet; - //! Flag that tracks if we own an LSHSearch object. - bool ownsLSHObject; + //! Maximum k value the object is trained for. + size_t maxKValue; //! Number of projections per table. size_t numProj; diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index b400d1cc228..50bf1869e12 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -20,16 +20,16 @@ namespace neighbor { // Constructor sets variables and trains the object. template -LSHModel::LSHModel(const arma::mat &referenceSet, - const double minRecall, - const double sampleSize, - const size_t k) +LSHModel:: +LSHModel(const arma::mat &referenceSet, + const double sampleSize, + const size_t k) { // We don't own the set - we just point to it. 
ownsSet = false; this->referenceSet = &referenceSet; - Train(referenceSet, minRecall, sampleSize, k); + Train(referenceSet, sampleSize, k); } // Destructor must de-allocate any referenceSet and LSHSearch objects we own. @@ -42,36 +42,37 @@ LSHModel::~LSHModel() // Trains the object. template -void LSHModel::Train(const arma::mat &referenceSet, - const double minRecall, - const double sampleSize, - const size_t k) +void LSHModel::Train( + const arma::mat &referenceSet, + const double sampleRate, + const size_t k) { - // TODO: Implement - - // Sanity Check: Verify that recall and sampleSize are in [0, 1). - if (minRecall >= 1 || minRecall < 0) - throw std::runtime_error("Recall must be floating point number in [0, 1)"); - - if (sampleSize > 1 || sampleSize <= 0) + // Sanity check - sample rate must be in (0, 1]. + if (sampleRate > 1 || sampleRate <= 0) throw std::runtime_error( "Sampling rate must be floating point number in (0, 1]"); - const size_t numPoints = referenceSet.n_cols; // Points in original set. + // Update the object's max K value information. + maxKValue = k; + + // Save pointer to training set. + this->referenceSet = &referenceSet; // Step 1. Select a random sample of the dataset. We will work with only that // sample. + arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu); // Keep a sample of the dataset: We have uniformly random numbers in [0, 1], - // so we expect about N*sampleSize of them to be in [0, sampleSize). + // so we expect about N*sampleRate of them to be in [0, sampleRate). arma::mat sampleSet = referenceSet.cols( - arma::find(sampleHelper < sampleSize)); + arma::find(sampleHelper < sampleRate)); // Shuffle to be impartial (in case dataset is sorted in some way). sampleSet = arma::shuffle(sampleSet); const size_t numSamples = sampleSet.n_cols; // Points in sampled set. - Log::Info << "Sampled " << numSamples << " points to train with." 
<< std::endl; + Log::Info << "Training model with " << numSamples << " points in sample set." + << std::endl; // Step 2. Compute all-vs-all distances of points in the sample. // The distance matrix is symmetric, so we only compute elements above the @@ -89,9 +90,9 @@ void LSHModel::Train(const arma::mat &referenceSe // Step 3. Estimate statistics of these distances: log(mean(d)), mean(log(d)), // mean(d). distances = arma::pow(distances, 2); - meanDist = arma::mean(distances); - logMeanDist = std::log(meanDist); - meanLogDist = arma::mean(arma::log(distances)); + this->meanDist = arma::mean(distances); + this->logMeanDist = std::log(meanDist); + this->meanLogDist = arma::mean(arma::log(distances)); // Step 4. Select a small part of the sample as 'anchor points'. Use the rest // of the sample as the reference set. Find the k-Nearest Neighbors' distances @@ -101,7 +102,7 @@ void LSHModel::Train(const arma::mat &referenceSe // The geometric mean of N numbers is the Nth root of the product of the // numbers. Through logarithmic properties though, this becomes computable // through exponentiating the mean of the logarithms of x: - // mean(log(x)) = geometricmean(x). + // exp(mean(log(x))) = geometricmean(x). // Number of samples to create for modeling the Gamma Distributions size_t regressionExamples = 50; // TODO: parameter? @@ -119,8 +120,8 @@ void LSHModel::Train(const arma::mat &referenceSe arma::mat Ek(regressionExamples, k); arma::mat Gk(regressionExamples, k); - Timer::Start("neighbors_distances"); // For each referenceSize, calculate the kNN of the anchors + Log::Info.ignoreInput = true; // Ignore kNN output. for (size_t i = 0; i < regressionExamples; ++i) { // TODO: Since we've already computed this, avoid calling kNN? 
@@ -137,19 +138,69 @@ void LSHModel::Train(const arma::mat &referenceSe Ek.row(i) = arma::mean(kNNDistances.t()); Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0)); } - Timer::Stop("neighbors_distances"); + Log::Info.ignoreInput = false; // Keep giving normal output. // Step 5. Model the arithmetic and geometric mean according to the paper. // This will produce 6 parameters (aE, bE, cE, aG, bG, cG). // Vector of k values. + Timer::Start("neighbor_statistic_regression"); arma::Col kValues = arma::linspace>(1, k, k); ApproximateKNNStatistics(referenceSizes, kValues, Ek, Gk); + Timer::Stop("neighbor_statistic_regression"); +} + +// Predict parameters for LSH that will have acceptable recall. +template +void LSHModel::Predict(const size_t datasetSize, + const size_t k, + const double minRecall) +{ + // Sanity check. Recall can't be greater/equal to 1, or negative. + if (minRecall < 0 || minRecall >=1) + throw std::runtime_error("minRecall must be in [0, 1)"); + + // If the object wasn't trained, die here. + if (referenceSet == NULL) + Log::Fatal << "Attempt to use Predict() on untrained Object. Exiting." + << std::endl; + + // Before proceeding, if requested K is larger than the k we trained with, + // re-train the object. + if (k > maxKValue) + { + + // Otherwise, warn the user of the re-training and re-train. + Log::Warn << "Larger k requested; Re-training the LSHModel " + "with default sampling rate and new k." << std::endl; + Train(*referenceSet, 0.1, k); // Default sampling rate. + } + // Steps 1 - 5 happen in Train(). // Step 6. Fit Gamma distributions to pairwise distances and kNN distances, // generated or estimated in steps 3 and 5. + // Gamma distribution for pairwise distances. + arma::vec logMeanVec(k + 1), meanLogVec(k + 1), meanVec(k + 1); + // Statistics were computed in Train() + meanVec(0) = this->meanDist; + logMeanVec(0) = this->logMeanDist; + meanLogVec(0) = this->meanLogDist; + // Train gamma and put in gammaDists[0]. 
+ + Timer::Start("fitting_distributions"); + for (size_t i = 1; i <= k; ++i) + { + meanVec(i) = aMeanPredictor.Predict(datasetSize, k); + logMeanVec(i) = std::log(meanVec(i)); + // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog + meanLogVec(i) = std::log(gMeanPredictor.Predict(datasetSize, k)); + } + // Fit the distribution. + distancesDistribution.Train(logMeanVec, meanLogVec, meanVec); + Timer::Stop("fitting_distributions"); // Step 7. Run Binary search on parameter space to minimize selectivity while // keeping recall above minimum. + } // Fit two predictors, one for arithmetic mean E and one for geometric mean G. diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp index 7192ce8ff96..00cd2d03e57 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp +++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp @@ -18,8 +18,12 @@ int main(int argc, char* argv[]) // Generate a random point set. size_t N = 5000; size_t d = 10; + size_t k = 5; + double sampleSize = 0.25; + double minRecall = 0.4; arma::mat rdata(d, N, arma::fill::randu); - LSHModel<> model(rdata, 0.7, 0.25, 2); + LSHModel<> model(rdata, sampleSize, k); + model.Predict(N, k, minRecall); arma::mat qdata(d, 1, arma::fill::randu); arma::Mat neighbors; From cdcb575826bfb3bd0ef4cafacf465435b3d6d144 Mon Sep 17 00:00:00 2001 From: mentekid Date: Fri, 12 Aug 2016 14:09:12 +0100 Subject: [PATCH 07/18] Adds Perturbation Sequence Generation (needs bugfixing) --- src/mlpack/methods/lsh_model/lshmodel.hpp | 59 +++++ .../methods/lsh_model/lshmodel_impl.hpp | 228 +++++++++++++++++- 2 files changed, 281 insertions(+), 6 deletions(-) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp index 2890b0a91ce..ca146f4d503 100644 --- a/src/mlpack/methods/lsh_model/lshmodel.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel.hpp @@ -165,6 +165,65 @@ class LSHModel void Serialize(Archive& ar); private: + + /** + 
* Returns the score of a perturbation vector generated by perturbation set A. + * The score of a pertubation set (vector) is the sum of scores of the + * participating actions. + * @param A perturbation set to compute the score of. + * @param scores vector containing score of each perturbation. + */ + double PerturbationScore(const std::vector& A, + const arma::vec& scores) const; + /** + * Inline function used by GetAdditionalProbingBins. The vector shift operation + * replaces the largest element of a vector A with (largest element) + 1. + * Returns true if resulting vector is valid, otherwise false. + * @param A perturbation set to shift. + */ + bool PerturbationShift(std::vector& A) const; + + /** + * Inline function used by GetAdditionalProbingBins. The vector expansion + * operation adds the element [1 + (largest_element)] to a vector A, where + * largest_element is the largest element of A. Returns true if resulting vector + * is valid, otherwise false. + * @param A perturbation set to expand. + */ + bool PerturbationExpand(std::vector& A) const; + + /** + * Return true if perturbation set A is valid. A perturbation set is invalid if + * it contains two (or more) actions for the same dimension or dimensions that + * are larger than the queryCode's dimensions. + * @param A perturbation set to validate. + * @param numProj The number of projections for the sequence under validation. + */ + bool PerturbationValid(const std::vector& A, size_t numProj) const; + /** + * Function that creates a template perturbation sequence given a value for + * an M and a W. The template perturbation sequence is based on the + * statistical properties of multi-probe LSH and uses those, instead of + * specific points, to generate scores. + * See mlpack/methods/lsh/lsh_search_impl.hpp for more details about how + * perturbation sequences are generated from specific points. 
+ * + * @param numProj The number of projections for the LSH scheme for which we + * want to compute the template perturbation sequence. + * @param hashWidth The hash width for the LSH scheme. + * @param numProbes The number of probes to generate. + */ + void GenerateTemplateSequence(size_t numProj, + double hashWidth, + size_t numProbes); + + /** Matrix that stores, in each column, the "direction" of the perturbation: + * 0 means no perturbation on that dimension, -1 means reduce dimension value + * by 1, and +1 means increase dimension value by 1. + */ + + arma::Mat templateSequence; + /** * Function that fits two DistanceStatisticPredictors - one * to predict arithmetic mean and one to preduct geometric mean. diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp index 50bf1869e12..b2de464eb6c 100644 --- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp @@ -49,8 +49,8 @@ void LSHModel::Train( { // Sanity check - sample rate must be in (0, 1]. if (sampleRate > 1 || sampleRate <= 0) - throw std::runtime_error( - "Sampling rate must be floating point number in (0, 1]"); + Log::Fatal << "Sampling rate must be floating point number in (0, 1]" + << std::endl; // Update the object's max K value information. maxKValue = k; @@ -60,7 +60,6 @@ void LSHModel::Train( // Step 1. Select a random sample of the dataset. We will work with only that // sample. - arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu); // Keep a sample of the dataset: We have uniformly random numbers in [0, 1], @@ -125,8 +124,10 @@ void LSHModel::Train( for (size_t i = 0; i < regressionExamples; ++i) { // TODO: Since we've already computed this, avoid calling kNN? 
+ // Reference set for kNN - arma::mat refMat = sampleSet.cols(numAnchors, numAnchors + referenceSizes(i) ); + arma::mat refMat = sampleSet.cols(numAnchors, + numAnchors + referenceSizes(i)); arma::Mat neighbors; // Not going to be used but required. arma::mat kNNDistances; // What we need. @@ -149,6 +150,7 @@ void LSHModel::Train( Timer::Stop("neighbor_statistic_regression"); } + // Predict parameters for LSH that will have acceptable recall. template void LSHModel::Predict(const size_t datasetSize, @@ -157,7 +159,7 @@ void LSHModel::Predict(const size_t datasetSize, { // Sanity check. Recall can't be greater/equal to 1, or negative. if (minRecall < 0 || minRecall >=1) - throw std::runtime_error("minRecall must be in [0, 1)"); + Log::Fatal << "Parameter minRecall must be in [0, 1)" << std::endl; // If the object wasn't trained, die here. if (referenceSet == NULL) @@ -174,7 +176,8 @@ void LSHModel::Predict(const size_t datasetSize, "with default sampling rate and new k." << std::endl; Train(*referenceSet, 0.1, k); // Default sampling rate. } - // Steps 1 - 5 happen in Train(). + + // Note: Steps 1 - 5 happen in Train(). // Step 6. Fit Gamma distributions to pairwise distances and kNN distances, // generated or estimated in steps 3 and 5. @@ -189,6 +192,8 @@ void LSHModel::Predict(const size_t datasetSize, Timer::Start("fitting_distributions"); for (size_t i = 1; i <= k; ++i) { + // Use the arithmetic and geometric mean predictors that were trained in + // Train() to estimate the statistics for the given datasetSize and k. meanVec(i) = aMeanPredictor.Predict(datasetSize, k); logMeanVec(i) = std::log(meanVec(i)); // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog @@ -198,6 +203,9 @@ void LSHModel::Predict(const size_t datasetSize, distancesDistribution.Train(logMeanVec, meanLogVec, meanVec); Timer::Stop("fitting_distributions"); + // See if works + //GenerateTemplateSequence(3, 0.5, 8); + // Step 7. 
Run Binary search on parameter space to minimize selectivity while // keeping recall above minimum. @@ -251,6 +259,211 @@ LSHObject(const size_t numProjIn, return lshObjectVector[lshObjectVector.size() - 1]; } +// Helper function to generate perturbations. +template +inline force_inline +double LSHModel::PerturbationScore( + const std::vector& A, + const arma::vec& scores) const +{ + double score = 0.0; + for (size_t i = 0; i < A.size(); ++i) + if (A[i]) + score += scores(i); // add scores of non-zero indices + return score; +} + +// Helper function to generate perturbations. +template +inline force_inline +bool LSHModel::PerturbationShift( + std::vector& A) const +{ + size_t maxPos = 0; + for (size_t i = 0; i < A.size(); ++i) + if (A[i] == 1) // Marked true. + maxPos = i; + + if (maxPos + 1 < A.size()) // Otherwise, this is an invalid vector. + { + A[maxPos] = 0; + A[maxPos + 1] = 1; + return true; // valid + } + return false; // invalid +} + +// Helper function to generate perturbations. +template +inline force_inline +bool LSHModel::PerturbationExpand( + std::vector& A) const +{ + // Find the last '1' in A. + size_t maxPos = 0; + for (size_t i = 0; i < A.size(); ++i) + if (A[i]) // Marked true. + maxPos = i; + + if (maxPos + 1 < A.size()) // Otherwise, this is an invalid vector. + { + A[maxPos + 1] = 1; + return true; + } + return false; +} + +// Helper function to generate perturbations. +template +inline force_inline +bool LSHModel::PerturbationValid( + const std::vector& A, + size_t numProj) const +{ + // Use check to mark dimensions we have seen before in A. If a dimension is + // seen twice (or more), A is not a valid perturbation. + std::vector check(numProj); + + if (A.size() > 2 * numProj) + return false; // This should never happen. + + // Check that we only see each dimension once. If not, vector is not valid. + for (size_t i = 0; i < A.size(); ++i) + { + // Only check dimensions that were included. 
+ if (!A[i]) + continue; + + // If dimesnion is unseen thus far, mark it as seen. + if (check[i % numProj] == false) + check[i % numProj] = true; + else + return false; // If dimension was seen before, set is not valid. + } + // If we didn't fail, set is valid. + return true; +} + +// Generate a probing sequence for a given M, W and T. +template +void LSHModel::GenerateTemplateSequence( + size_t numProj, + double hashWidth, + size_t numProbes) +{ + // If no additional probes requested, stop here. + if (numProbes == 0) + return; + + // If number of additional probes exceeds possible, set to max possible. + if (numProbes > ((1 << numProj) - 1)) + numProbes = (1 << numProj) - 1; + + // Calculate the expected scores based on Multi-probe LSH paper. + arma::vec scores(2 * numProj); + double M = (double) numProj; // To avoid integer division headache. + // "Positive" scores. + for (size_t j = 0; j < numProj; ++j) + scores(j) = pow(hashWidth, 2) * (j + 1 * (j + 2))/(4 * (M + 1) * (M + 2)); + // "Negative" scores. + for (size_t j = numProj; j < 2 * numProj; ++j) + scores(j) = pow(hashWidth, 2) * + (1 - + (2 * M + 1 - (j + 1))/(M + 1) + + ((2 * M + 1 - (j + 1)) * (2 * M + 2 - (j + 1)))/(4 * (M + 1) * (M + 2))); + cout << scores << endl; + + // A "+1" signifies a positive perturbation, a "-1" a negative one. + arma::Col actions(2 * numProj); // will be [1 ... -1 ...] + actions.rows(0, numProj - 1) = // First numProj rows. + arma::ones< arma::Col > (numProj); // 1s + actions.rows(numProj, (2 * numProj) - 1) = // Last numProj rows. + -1 * arma::ones< arma::Col > (numProj); // -1s + + // The "acting dimension", or which of the numProj dimension to increase or + // reduce according to the "actions". + arma::Col positions(2 * numProj); // Will be [0 1 2 ... 0 1 2 ...]. 
+ positions.rows(0, numProj - 1) = + arma::linspace< arma::Col >(0, numProj - 1, numProj); + positions.rows(numProj, 2 * numProj - 1) = + arma::linspace< arma::Col >(0, numProj - 1, numProj); + + // Sort all three vectors so smaller scoring perturbations are first. + arma::uvec sortidx = arma::sort_index(scores); + scores = scores(sortidx); + actions = actions(sortidx); + positions = positions(sortidx); + + // From LSHSearch::GetAdditionalProbingBins. TODO: Modularize? + + // Perturbation sets (A) mark with 1 the (score, action, dimension) positions + // included in a given perturbation vector. Other spaces are 0. + std::vector Ao(2 * numProj); + Ao[0] = 1; // Smallest vector includes only smallest score. + + std::vector< std::vector > perturbationSets; + perturbationSets.push_back(Ao); // Storage of perturbation sets. + + std::priority_queue< + std::pair, // contents: pairs of (score, index) + std::vector< // container: vector of pairs + std::pair + >, + std::greater< std::pair > // comparator of pairs + > minHeap; // our minheap + + // Start by adding the lowest scoring set to the minheap. + minHeap.push( std::make_pair(PerturbationScore(Ao, scores), 0) ); + + // Loop invariable: after pvec iterations, additionalProbingBins contains pvec + // valid codes of the lowest-scoring bins (bins most likely to contain + // neighbors of the query). + + // Allocate 1 column per perturbed "code". + this->templateSequence.zeros(numProj, numProbes); + for (size_t pvec = 0; pvec < numProbes; ++pvec) + { + std::vector Ai; + do + { + // Get the perturbation set corresponding to the minimum score. + Ai = perturbationSets[ minHeap.top().second ]; + minHeap.pop(); // .top() returns, .pop() removes + + // Shift operation on Ai (replace max with max+1). + std::vector As = Ai; + if (PerturbationShift(As) && PerturbationValid(As, numProj)) + // Don't add invalid sets. 
+ { + perturbationSets.push_back(As); // add shifted set to sets + minHeap.push( + std::make_pair(PerturbationScore(As, scores), + perturbationSets.size() - 1)); + } + + // Expand operation on Ai (add max+1 to set). + std::vector Ae = Ai; + if (PerturbationExpand(Ae) && PerturbationValid(Ae, numProj)) + // Don't add invalid sets. + { + perturbationSets.push_back(Ae); // add expanded set to sets + minHeap.push( + std::make_pair(PerturbationScore(Ae, scores), + perturbationSets.size() - 1)); + } + + } while (!PerturbationValid(Ai, numProj));//Discard invalid perturbations + + // Found valid perturbation set Ai. Construct perturbation vector from set. + for (size_t pos = 0; pos < Ai.size(); ++pos) + { + // If Ai[pos] is marked, set template to +/- 1. + if (Ai[pos] == 1) + templateSequence(positions(pos), pvec) = actions(pos); + } + } +} + // Fit a curve to the data provided. template double LSHModel::DistanceStatisticPredictor::Train( @@ -266,7 +479,10 @@ double LSHModel::DistanceStatisticPredictor::Trai // Get an initial point from the optimizer. arma::mat currentPoint = f.GetInitialPoint(); + // Silence debug output of L_BFGS (TODO: remove) + Log::Debug.ignoreInput = true; double result = opt.Optimize(currentPoint); + Log::Debug.ignoreInput = false; // Optimizer is done - set alpha, beta, gamma. 
this->alpha = currentPoint(0, 0); From 78cab0be59414f3fa598bdadd61f4ed92742ece9 Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 16 Aug 2016 11:02:11 +0100 Subject: [PATCH 08/18] Merges the LSH and LSHModel folders --- src/mlpack/methods/CMakeLists.txt | 1 - src/mlpack/methods/lsh/CMakeLists.txt | 5 +++++ .../methods/{lsh_model => lsh}/lshmodel.hpp | 0 .../{lsh_model => lsh}/lshmodel_impl.hpp | 0 .../{lsh_model => lsh}/lshmodel_main.cpp | 0 .../{lsh_model => lsh}/objectivefunction.hpp | 0 src/mlpack/methods/lsh_model/CMakeLists.txt | 19 ------------------- 7 files changed, 5 insertions(+), 20 deletions(-) rename src/mlpack/methods/{lsh_model => lsh}/lshmodel.hpp (100%) rename src/mlpack/methods/{lsh_model => lsh}/lshmodel_impl.hpp (100%) rename src/mlpack/methods/{lsh_model => lsh}/lshmodel_main.cpp (100%) rename src/mlpack/methods/{lsh_model => lsh}/objectivefunction.hpp (100%) delete mode 100644 src/mlpack/methods/lsh_model/CMakeLists.txt diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt index 6a098340339..dbbd2318bee 100644 --- a/src/mlpack/methods/CMakeLists.txt +++ b/src/mlpack/methods/CMakeLists.txt @@ -36,7 +36,6 @@ set(DIRS local_coordinate_coding logistic_regression lsh - lsh_model # mvu matrix_completion naive_bayes diff --git a/src/mlpack/methods/lsh/CMakeLists.txt b/src/mlpack/methods/lsh/CMakeLists.txt index 3540e04c7b9..2660fdb3df2 100644 --- a/src/mlpack/methods/lsh/CMakeLists.txt +++ b/src/mlpack/methods/lsh/CMakeLists.txt @@ -4,6 +4,9 @@ set(SOURCES # LSH-search class lsh_search.hpp lsh_search_impl.hpp + # LSH-model class + lshmodel.hpp + lshmodel_impl.hpp ) # Add directory name to sources. @@ -18,3 +21,5 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) # The code to compute the approximate neighbor for the given query and reference # sets with p-stable LSH. add_cli_executable(lsh) +# The code that models LSH to make parameter tuning easier. 
+add_cli_executable(lshmodel) diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp similarity index 100% rename from src/mlpack/methods/lsh_model/lshmodel.hpp rename to src/mlpack/methods/lsh/lshmodel.hpp diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp similarity index 100% rename from src/mlpack/methods/lsh_model/lshmodel_impl.hpp rename to src/mlpack/methods/lsh/lshmodel_impl.hpp diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp similarity index 100% rename from src/mlpack/methods/lsh_model/lshmodel_main.cpp rename to src/mlpack/methods/lsh/lshmodel_main.cpp diff --git a/src/mlpack/methods/lsh_model/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp similarity index 100% rename from src/mlpack/methods/lsh_model/objectivefunction.hpp rename to src/mlpack/methods/lsh/objectivefunction.hpp diff --git a/src/mlpack/methods/lsh_model/CMakeLists.txt b/src/mlpack/methods/lsh_model/CMakeLists.txt deleted file mode 100644 index c3799753aec..00000000000 --- a/src/mlpack/methods/lsh_model/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Define the files we need to compile. -# Anything not in this list will not be compiled into mlpack. -set(SOURCES - # LSH-model class - lshmodel.hpp - lshmodel_impl.hpp -) - -# Add directory name to sources. -set(DIR_SRCS) -foreach(file ${SOURCES}) - set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) -endforeach() -# Append sources (with directory name) to list of all mlpack sources (used at -# the parent scope). -set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) - -# The code that models LSH to return a set of parameters that works well. 
-add_cli_executable(lshmodel) From 2c88406a752ae797089cb8c617cc698963fe8500 Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 16 Aug 2016 11:10:25 +0100 Subject: [PATCH 09/18] Removes LogDebug.ignoreInput statement --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index b2de464eb6c..476a89d0c85 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -480,9 +480,7 @@ double LSHModel::DistanceStatisticPredictor::Trai // Get an initial point from the optimizer. arma::mat currentPoint = f.GetInitialPoint(); // Silence debug output of L_BFGS (TODO: remove) - Log::Debug.ignoreInput = true; double result = opt.Optimize(currentPoint); - Log::Debug.ignoreInput = false; // Optimizer is done - set alpha, beta, gamma. this->alpha = currentPoint(0, 0); From f4af3dc878f5f9681078cee7ec678557eace455c Mon Sep 17 00:00:00 2001 From: mentekid Date: Sun, 21 Aug 2016 13:47:13 +0100 Subject: [PATCH 10/18] Separates DistanceStatisticPredictor implementation --- .../lsh/distance_statistic_predictor.hpp | 125 ++++++++++ src/mlpack/methods/lsh/lshmodel.hpp | 153 +++++------- src/mlpack/methods/lsh/lshmodel_impl.hpp | 233 +++++++++++++----- src/mlpack/methods/lsh/lshmodel_main.cpp | 7 +- 4 files changed, 355 insertions(+), 163 deletions(-) create mode 100644 src/mlpack/methods/lsh/distance_statistic_predictor.hpp diff --git a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp new file mode 100644 index 00000000000..da1caf5a699 --- /dev/null +++ b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp @@ -0,0 +1,125 @@ +/** + * @file distance_statistic_predictor.hpp + * @author Yannis Mentekidis + * + * This file defines a helper class that uses the function a * k^b * N^c for + * some parameters a, b, c that have been fit to either predict the arithmetic 
+ * or geometric mean of the squared distance of a point to its k-nearest + * neighbor, given some dataset size N and its k-nearest neighbor. + * + * DistanceStatisticPredictor objects are used by the LSHModel class of mlpack. + */ +#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_DISTANCE_STATISTIC_PREDICTOR_HPP +#define MLPACK_METHODS_NEIGHBOR_SEARCH_DISTANCE_STATISTIC_PREDICTOR_HPP + +// For curve fitting. +#include +// Default objective function. +#include "objectivefunction.hpp" + +namespace mlpack +{ +namespace neighbor +{ + +template +class DistanceStatisticPredictor +{ + public: + //! Empty constructor. + DistanceStatisticPredictor() { }; + + /** + * Function to construct with training set. + * + * @param inputSize A vector of input sizes. The first input variable of + * the regression. + * @param kValues A vector of k values. The second input variable of the + * regression. + * @param statistic A vector of responses - the value of the statistic for + * each given inputSize. + */ + DistanceStatisticPredictor(const arma::Col& inputSize, + const arma::Col& kValues, + const arma::mat& statistic) + { Train(inputSize, kValues, statistic); }; + + //! Default destructor. + ~DistanceStatisticPredictor() { }; + + /** + * Function that fits the alpha, beta and gamma parameters. + * + * @param inputSize A vector of input sizes. The first input variable of + * the regression. + * @param kValues A vector of k values. The second input variable of the + * regression. + * @param statistic A vector of responses - the value of the statistic for + * each given inputSize. + */ + double Train(const arma::Col& inputSize, + const arma::Col& kValues, + const arma::mat& statistic); + + /** + * Evaluate the statistic for a given dataset size. + * + * @param N - a new input size for which to evaluate the expected + * statistic. + */ + double Predict(size_t N, size_t k) + { return alpha * std::pow(k, beta) * std::pow(N, gamma); }; + + //! Set the alpha parameter. 
+ void Alpha(double a) { alpha = a; }; + + //! Get the alpha parameter. + double Alpha(void) { return alpha; }; + + //! Set the beta parameter. + void Beta(double b) { beta = b; }; + + //! Get the beta parameter. + double Beta(void) { return beta; }; + + //! Set the gamma parameter. + void Gamma(double c) { gamma = c; }; + + //! Get the gamma parameter. + double Gamma(void) { return gamma; }; + + private: + double alpha; + double beta; + double gamma; +}; + +// Fit a curve to the data provided. +template +double DistanceStatisticPredictor::Train( + const arma::Col& inputSize, + const arma::Col& kValues, + const arma::mat& statistic) +{ + // Objective function for fitting the E(x, k) curve to the statistic. + ObjectiveFunction f(inputSize, kValues, statistic); + + // Optimizer. Use L_BFGS (TODO: Make this a template parameter?) + mlpack::optimization::L_BFGS opt(f); + + // Get an initial point from the optimizer. + arma::mat currentPoint = f.GetInitialPoint(); + double result = opt.Optimize(currentPoint); + + // Optimizer is done - set alpha, beta, gamma. + this->alpha = currentPoint(0, 0); + this->beta = currentPoint(1, 0); + this->gamma = currentPoint(2, 0); + + return result; +} + +} // namespace neighbor +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp index ca146f4d503..1c839d6ebba 100644 --- a/src/mlpack/methods/lsh/lshmodel.hpp +++ b/src/mlpack/methods/lsh/lshmodel.hpp @@ -45,12 +45,10 @@ // For template parameters and kNN search (if nescessary). #include #include -// For curve fitting. -#include -// Default objective function. -#include "objectivefunction.hpp" // Gamma distribution for modeling squared distances. #include +// For fitting distance statistic regressors. 
+#include "distance_statistic_predictor.hpp" namespace mlpack { namespace neighbor { @@ -111,8 +109,9 @@ class LSHModel const size_t maxKValue = 32); /** - * Predict() finds LSH parameters that should work well for the dataset the - * LSHModel was trained for. + * This function uses the trained model to predict recall / selectivity + * values for a given parameter set. + * * Warning: If the k specified is larger than the maxKValue passed to * Train(), Train() will be called again. This might have adverse effects to * performance. @@ -123,7 +122,12 @@ class LSHModel */ void Predict(const size_t datasetSize, const size_t k, - const double minRecall); + const size_t numTables, + const size_t numProj, + const size_t numProbes, + const double hashWidth, + double& predictedRecall, + double& predictedSelect); /** * This function returns an LSHSearch object trained with the parameters @@ -200,29 +204,59 @@ class LSHModel * @param numProj The number of projections for the sequence under validation. */ bool PerturbationValid(const std::vector& A, size_t numProj) const; + /** * Function that creates a template perturbation sequence given a value for * an M and a W. The template perturbation sequence is based on the * statistical properties of multi-probe LSH and uses those, instead of - * specific points, to generate scores. + * specific points, to generate scores. The template sequence is also + * independent of the hashWidth, and depends only on numProj and numProbes. + * * See mlpack/methods/lsh/lsh_search_impl.hpp for more details about how * perturbation sequences are generated from specific points. * * @param numProj The number of projections for the LSH scheme for which we * want to compute the template perturbation sequence. - * @param hashWidth The hash width for the LSH scheme. * @param numProbes The number of probes to generate. 
*/ void GenerateTemplateSequence(size_t numProj, - double hashWidth, size_t numProbes); - /** Matrix that stores, in each column, the "direction" of the perturbation: - * 0 means no perturbation on that dimension, -1 means reduce dimension value - * by 1, and +1 means increase dimension value by 1. + /** + * This function evaluates the probability that two points that are at + * distance chi from each other will be neighbors when we use LSH with a + * specific number of projections, probing bins, and tables for a given hash + * width. + * + * @param chi The distance of two points. + * @param hashWidth The first-level hash width. + * @param numTables The number of random projection tables used by LSH. + * @param numProj The number of projections per hash table (dimensionality of + * new space). + * @param numProbes The number of additional probing bins of Multiprobe LSH. */ - - arma::Mat templateSequence; + //TODO: inline? + double Rho(double chi, + double hashWidth, + size_t numTables, + size_t numProj, + size_t numProbes); + /** + * This is a helper function that is called by Rho() and returns the inner + * value of the product used in the calculation of the probability that Rho + * calculates. + * + * @param chi The distance of two points. + * @param hashWidth The first-level hash width. + * @param delta The perturbation to evaluate for. + * @param proj The projection we evaluate for ( 0 <= proj < numProj). + * @param numProj The total number of projections. 
+ */ + inline double SameBucketProbability(double chi, + double hashWidth, + short delta, + size_t proj, + size_t numProj); /** * Function that fits two DistanceStatisticPredictors - one @@ -242,88 +276,19 @@ class LSHModel const arma::mat& Ek, const arma::mat& Gk); - /** - * This is a helper class that uses the function a * k^b * N^c for some - * parameters a, b, c that have been fit to either predict the arithmetic or - * geometric mean of the squared distance of a point to its k-nearest - * neighbor, given some dataset size N and its k-nearest neighbor. + + /** + * Matrix that stores, in each column, the "direction" of the perturbation: + * 0 means no perturbation on that dimension, -1 means reduce dimension value + * by 1, and +1 means increase dimension value by 1. */ - class DistanceStatisticPredictor - { - public: - //! Empty constructor. - DistanceStatisticPredictor() { }; - - /** - * Function to construct with training set. - * - * @param inputSize A vector of input sizes. The first input variable of - * the regression. - * @param kValues A vector of k values. The second input variable of the - * regression. - * @param statistic A vector of responses - the value of the statistic for - * each given inputSize. - */ - DistanceStatisticPredictor(const arma::Col& inputSize, - const arma::Col& kValues, - const arma::mat& statistic) - { Train(inputSize, kValues, statistic); }; - - //! Default destructor. - ~DistanceStatisticPredictor() { }; - - /** - * Function that fits the alpha, beta and gamma parameters. - * - * @param inputSize A vector of input sizes. The first input variable of - * the regression. - * @param kValues A vector of k values. The second input variable of the - * regression. - * @param statistic A vector of responses - the value of the statistic for - * each given inputSize. - */ - double Train(const arma::Col& inputSize, - const arma::Col& kValues, - const arma::mat& statistic); - - /** - * Evaluate the statistic for a given dataset size. 
- * - * @param N - a new input size for which to evaluate the expected - * statistic. - */ - double Predict(size_t N, size_t k) - { return alpha * std::pow(k, beta) * std::pow(N, gamma); }; - - //! Set the alpha parameter. - void Alpha(double a) { alpha = a; }; - - //! Get the alpha parameter. - double Alpha(void) { return alpha; }; - - //! Set the beta parameter. - void Beta(double b) { beta = b; }; - - //! Get the beta parameter. - double Beta(void) { return beta; }; - - //! Set the gamma parameter. - void Gamma(double c) { gamma = c; }; - - //! Get the gamma parameter. - double Gamma(void) { return gamma; }; - - private: - double alpha; - double beta; - double gamma; - }; + arma::Mat templateSequence; //! DistanceStatisticPredictor for arithmetic mean. - DistanceStatisticPredictor aMeanPredictor; + DistanceStatisticPredictor aMeanPredictor; //! DistanceStatisticPredictor for geometric mean. - DistanceStatisticPredictor gMeanPredictor; + DistanceStatisticPredictor gMeanPredictor; //! (k+1)-dimensional gamma distribution for predicting squared distances. mlpack::distribution::GammaDistribution distancesDistribution; @@ -349,8 +314,8 @@ class LSHModel //! Reference dataset. const arma::mat* referenceSet; - //! LSHSearch Object Vector. - std::vector> lshObjectVector; + //! LSHSearch Object + LSHSearch trainedLSHObject; //! Statistic: average squared distance of points. double meanDist; diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index 476a89d0c85..6ea549c9700 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -8,6 +8,7 @@ #define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP #include "lshmodel.hpp" +#include // pdf and cdf needed //TODO: remove @@ -151,22 +152,23 @@ void LSHModel::Train( } -// Predict parameters for LSH that will have acceptable recall. +// Predict recall / selectivity for the given parameters. 
template void LSHModel::Predict(const size_t datasetSize, const size_t k, - const double minRecall) + const size_t numTables, + const size_t numProj, + const size_t numProbes, + const double hashWidth, + double& predictedRecall, + double& predictedSelect) { - // Sanity check. Recall can't be greater/equal to 1, or negative. - if (minRecall < 0 || minRecall >=1) - Log::Fatal << "Parameter minRecall must be in [0, 1)" << std::endl; - // If the object wasn't trained, die here. if (referenceSet == NULL) Log::Fatal << "Attempt to use Predict() on untrained Object. Exiting." << std::endl; - // Before proceeding, if requested K is larger than the k we trained with, + // Before proceeding, if requested k is larger than the k we trained with, // re-train the object. if (k > maxKValue) { @@ -203,12 +205,139 @@ void LSHModel::Predict(const size_t datasetSize, distancesDistribution.Train(logMeanVec, meanLogVec, meanVec); Timer::Stop("fitting_distributions"); - // See if works - //GenerateTemplateSequence(3, 0.5, 8); + // Step 7. Generate the Template Probing Sequence using the maximum number of + // projections and the maximum number of probes. + GenerateTemplateSequence(numProj, numProbes); + + // Step 8. Use formulas (19) and (20) from the paper to predict recall and + // selectivity, using LSHModel::Rho() and the distribution functions of the + // gammas we fit back in Step 6. + predictedRecall = 0.5; + predictedSelect = 0.5; +} + + +/* NOTE: My interpretation of the paper would result in this code, but LSHKIT's + * implementation is different. I'm commenting this out to try their way, and I + * might go back to this if I see both work the same. + +// Probability of two points being neighbors if they are at distance chi. 
+template +double LSHModel::Rho(double chi, + double hashWidth, + size_t numTables, + size_t numProj, + size_t numProbes) +{ + // Calculate the formula: + // 1 - {Prod{1 - Prod{same_bin_probability}}}^numTables, where: + // * same_bin_probability is calculated with the Value() function. + // * Prod{same_bin_probability} is stored in product. + // * Prod{1 - Prod{same_bin_probability}} is stored in rho. + + double rho = 1; + + // Row-major loop :(. TODO: Refactor to make column-major. + for (size_t proj = 0; proj < numProj; ++proj) + { + double product = 1; + for (size_t probe = 0; probe < numProbes; ++probe) + { + // Use perturbation value (proj, probe), i.e. \delta_{\mu, \tau} + product *= Value(chi, hashWidth, templateSequence(proj, probe), numProj); + } + + rho *= (1 - product); + } + + return 1 - std::pow(rho, numTables); +} + +// Probability of two points being neighbors if they are at distance chi. +template +double LSHModel::SameBucketProbability(double chi, + double hashWidth, + short delta, + size_t proj, + size_t numProj) +{ + if (delta == 0) + { + // No perturbation - probability of two queries sharing the same bin. + // Use the "default" normal distribution with mean = 0, sd = 1. + boost::math::normal_distribution phi; + return 2 * phi.pdf(hashWidth / chi) - 1 + + std::sqrt(2 / M_PI) + * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi); + } + else + { + // +1/-1 perturbation - probability of two queries being in adjacent bins. + double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0)); + + // Negative perturbation - flip deltaI. + if (delta == -1) + deltaI = 1 - deltaI; + + boost::math::normal_distribution phi(-delta, chi); + return phi.cdf(hashWidth) - phi.cdf(0); + } +} +*/ + +/* + * Based on the LSHKIT implementation, not my understanding of the paper. + */ +// Probability of two points being neighbors if they are at distance chi. 
+template +double LSHModel::Rho(double chi, + double hashWidth, + size_t numTables, + size_t numProj, + size_t numProbes) +{ + double rho = 0; + + for (size_t probe = 0; probe < numProbes; ++probe) + { + double rTemp = 1; + for (size_t proj = 0; proj < numProj; ++proj) + { + rTemp *= SameBucketProbability(chi, hashWidth, + templateSequence(proj, probe), proj, numProj); + } + rho += rTemp; + } - // Step 7. Run Binary search on parameter space to minimize selectivity while - // keeping recall above minimum. + return 1 - std::exp(std::log(1.0 - rho) * numTables); +} +// Probability of two points being neighbors if they are at distance chi. +template +double LSHModel:: +SameBucketProbability(double chi, double hashWidth, short delta, size_t proj, + size_t numProj) +{ + boost::math::normal_distribution<> phi; + if (delta == 0) + { + // No perturbation - probability of two queries sharing the same bin. + return 2 * pdf(phi, hashWidth / chi) - 1 + + std::sqrt(2 / M_PI) + * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi); + } + else + { + // +1/-1 perturbation - probability of two queries being in adjacent bins. + double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0)); + + // Negative perturbation - flip deltaI. + if (delta == -1) + deltaI = 1 - deltaI; + + return cdf(phi, hashWidth / chi * (1 + deltaI)) + - cdf(phi, hashWidth / chi * deltaI); + } } // Fit two predictors, one for arithmetic mean E and one for geometric mean G. @@ -254,9 +383,9 @@ LSHObject(const size_t numProjIn, LSHSearch<> lsh(*referenceSet, numProjOut, numTablesOut, hashWidthOut, secondHashSize, bucketSize); - lshObjectVector.push_back(lsh); + trainedLSHObject = lsh; - return lshObjectVector[lshObjectVector.size() - 1]; + return trainedLSHObject; } // Helper function to generate perturbations. 
@@ -348,30 +477,35 @@ bool LSHModel::PerturbationValid( template void LSHModel::GenerateTemplateSequence( size_t numProj, - double hashWidth, size_t numProbes) { - // If no additional probes requested, stop here. + // If no probes requested, stop here. if (numProbes == 0) + { + Log::Warn << "GenerateTemplateSequence called with numProbes = 0" + << std::endl; return; + } // If number of additional probes exceeds possible, set to max possible. - if (numProbes > ((1 << numProj) - 1)) - numProbes = (1 << numProj) - 1; + if (numProbes > pow(3, numProj)) + numProbes = pow(3, numProj); // {-1, 0, 1} for each probe. // Calculate the expected scores based on Multi-probe LSH paper. arma::vec scores(2 * numProj); - double M = (double) numProj; // To avoid integer division headache. - // "Positive" scores. - for (size_t j = 0; j < numProj; ++j) - scores(j) = pow(hashWidth, 2) * (j + 1 * (j + 2))/(4 * (M + 1) * (M + 2)); - // "Negative" scores. - for (size_t j = numProj; j < 2 * numProj; ++j) - scores(j) = pow(hashWidth, 2) * - (1 - - (2 * M + 1 - (j + 1))/(M + 1) + - ((2 * M + 1 - (j + 1)) * (2 * M + 2 - (j + 1)))/(4 * (M + 1) * (M + 2))); - cout << scores << endl; + double M = (double) numProj; + + // Generate expected scores in sorted order. + for (size_t i = 0; i < numProj; ++i) + { + // Everything is double to avoid integer division headache. + double left = double(i); + double right = 2 * M - left - 1; + + // Expected score - left boundary. + scores[left] = (left + 1) * (left + 2) / (2 * (M + 1) * (M + 2)); + scores[right] = 1 - (left + 1)/(M + 1) + scores[left]; + } // A "+1" signifies a positive perturbation, a "-1" a negative one. arma::Col actions(2 * numProj); // will be [1 ... -1 ...] @@ -388,14 +522,6 @@ void LSHModel::GenerateTemplateSequence( positions.rows(numProj, 2 * numProj - 1) = arma::linspace< arma::Col >(0, numProj - 1, numProj); - // Sort all three vectors so smaller scoring perturbations are first. 
- arma::uvec sortidx = arma::sort_index(scores); - scores = scores(sortidx); - actions = actions(sortidx); - positions = positions(sortidx); - - // From LSHSearch::GetAdditionalProbingBins. TODO: Modularize? - // Perturbation sets (A) mark with 1 the (score, action, dimension) positions // included in a given perturbation vector. Other spaces are 0. std::vector Ao(2 * numProj); @@ -415,13 +541,11 @@ void LSHModel::GenerateTemplateSequence( // Start by adding the lowest scoring set to the minheap. minHeap.push( std::make_pair(PerturbationScore(Ao, scores), 0) ); - // Loop invariable: after pvec iterations, additionalProbingBins contains pvec - // valid codes of the lowest-scoring bins (bins most likely to contain - // neighbors of the query). - // Allocate 1 column per perturbed "code". - this->templateSequence.zeros(numProj, numProbes); - for (size_t pvec = 0; pvec < numProbes; ++pvec) + templateSequence.zeros(numProj, numProbes); + + // Column 0 is all 0s. Fill columns 1:numProbes using Lv's algorithm. + for (size_t pvec = 1; pvec < numProbes; ++pvec) { std::vector Ai; do @@ -464,31 +588,6 @@ void LSHModel::GenerateTemplateSequence( } } -// Fit a curve to the data provided. -template -double LSHModel::DistanceStatisticPredictor::Train( - const arma::Col& inputSize, - const arma::Col& kValues, - const arma::mat& statistic) -{ - // Objective function for fitting the E(x, k) curve to the statistic. - ObjectiveFunction f(inputSize, kValues, statistic); - - // Optimizer. Use L_BFGS (TODO: Make this a template parameter?) - mlpack::optimization::L_BFGS opt(f); - - // Get an initial point from the optimizer. - arma::mat currentPoint = f.GetInitialPoint(); - // Silence debug output of L_BFGS (TODO: remove) - double result = opt.Optimize(currentPoint); - - // Optimizer is done - set alpha, beta, gamma. - this->alpha = currentPoint(0, 0); - this->beta = currentPoint(1, 0); - this->gamma = currentPoint(2, 0); - - return result; -} // Serialize the object and save to a file. 
template diff --git a/src/mlpack/methods/lsh/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp index 00cd2d03e57..20b9f0ee4f6 100644 --- a/src/mlpack/methods/lsh/lshmodel_main.cpp +++ b/src/mlpack/methods/lsh/lshmodel_main.cpp @@ -20,10 +20,13 @@ int main(int argc, char* argv[]) size_t d = 10; size_t k = 5; double sampleSize = 0.25; - double minRecall = 0.4; + double recall, selectivity; arma::mat rdata(d, N, arma::fill::randu); LSHModel<> model(rdata, sampleSize, k); - model.Predict(N, k, minRecall); + model.Predict(N, k, 16, 4, 4, 1.0, recall, selectivity); + + Log::Info << "Model predicts " << recall*100 << "\% recall and " + << selectivity*100 << "\% selectivity." << std::endl; arma::mat qdata(d, 1, arma::fill::randu); arma::Mat neighbors; From a73b46893eeaae340553de1e561e485424708fc3 Mon Sep 17 00:00:00 2001 From: mentekid Date: Sun, 21 Aug 2016 14:58:53 +0100 Subject: [PATCH 11/18] First working version (needs debugging) --- .../lsh/distance_statistic_predictor.hpp | 2 +- src/mlpack/methods/lsh/lshmodel.hpp | 95 ++++++++++++++++--- src/mlpack/methods/lsh/lshmodel_impl.hpp | 73 +++++++++++++- 3 files changed, 152 insertions(+), 18 deletions(-) diff --git a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp index da1caf5a699..fc6b34a4b56 100644 --- a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp +++ b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp @@ -104,7 +104,7 @@ double DistanceStatisticPredictor::Train( // Objective function for fitting the E(x, k) curve to the statistic. ObjectiveFunction f(inputSize, kValues, statistic); - // Optimizer. Use L_BFGS (TODO: Make this a template parameter?) + // Optimizer. Use L_BFGS mlpack::optimization::L_BFGS opt(f); // Get an initial point from the optimizer. 
diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp index 1c839d6ebba..5c250404bb7 100644 --- a/src/mlpack/methods/lsh/lshmodel.hpp +++ b/src/mlpack/methods/lsh/lshmodel.hpp @@ -88,7 +88,7 @@ class LSHModel * estimate parameters of the dataset. The estimated parameters are: * * Arithmetic mean of pairwise distances of random points in the sample. * * Geometric mean for the pairwise distnaces - * * Arithmetic mean of distance random point to its k-th nearest neighbor + * * Arithmetic mean of distance random point to its k-th nearest neighbor * as a function of |N|, the number of points. * * Geometric mean of the same distance. * @@ -104,7 +104,7 @@ class LSHModel * @param maxKValue The maximum number of nearest neighbors for each query to * train for. */ - void Train(const arma::mat& referenceSet, + void Train(const arma::mat& referenceSet, const double sampleRate = 0.1, const size_t maxKValue = 32); @@ -120,8 +120,8 @@ class LSHModel * @param k The number of k-nearest neighbors LSH must find. * @param minRecall The minimum acceptable recall we want to tune for. */ - void Predict(const size_t datasetSize, - const size_t k, + void Predict(const size_t datasetSize, + const size_t k, const size_t numTables, const size_t numProj, const size_t numProbes, @@ -219,7 +219,7 @@ class LSHModel * want to compute the template perturbation sequence. * @param numProbes The number of probes to generate. 
*/ - void GenerateTemplateSequence(size_t numProj, + void GenerateTemplateSequence(size_t numProj, size_t numProbes); /** @@ -239,8 +239,8 @@ class LSHModel double Rho(double chi, double hashWidth, size_t numTables, - size_t numProj, - size_t numProbes); + size_t numProj, + size_t numProbes) const; /** * This is a helper function that is called by Rho() and returns the inner * value of the product used in the calculation of the probability that Rho @@ -252,11 +252,80 @@ class LSHModel * @param proj The projection we evaluate for ( 0 <= proj < numProj). * @param numProj The total number of projections. */ - inline double SameBucketProbability(double chi, - double hashWidth, + inline double SameBucketProbability(double chi, + double hashWidth, short delta, size_t proj, - size_t numProj); + size_t numProj) const; + + /** + * This function calculates the recall of LSH for a given set of parameters. + * It uses the function + * + * r = \frac{1}{K} \sum_{1}^{K} \int_{0}^{\infty}(Rho(\sqrt{x}) * f_k(x)) dx + * + * as proposed in the paper. + * + */ + double Recall(size_t maxK, + size_t numTables, + size_t numProj, + size_t numProbes, + double hashWidth); + + /** + * This function calculates the selectivity of LSH for a given set of parameters. + * It uses the function + * + * s = \int_{0}^{\infty}(Rho(\sqrt{x}) * f(x)) dx + * + * as proposed in the paper. + * + */ + double Selectivity(size_t numTables, + size_t numProj, + size_t numProbes, + double hashWidth); + + /** + * Helper class for boost::integration. + */ + class IntegralObjective + { + public: + // Initialize everything. 
+ IntegralObjective(const size_t k, + const size_t numTables, + const size_t numProj, + const size_t numProbes, + const double hashWidth, + const mlpack::distribution::GammaDistribution* gamma, + const LSHModel* model) + : k(k), numTables(numTables), numProj(numProj), + numProbes(numProbes), hashWidth(hashWidth), gamma(gamma), model(model) + { /* do nothing */}; + + ~IntegralObjective() { }; + + // Use as function with the operator () and one argument. + double operator()(const double& chi) const + { + return + (model->Rho(std::sqrt(chi), hashWidth, numTables, numProj, numProbes)) + * (gamma->Probability(chi, k)); + } + + private: + const size_t k; + const size_t numTables; + const size_t numProj; + const size_t numProbes; + const double hashWidth; + + const mlpack::distribution::GammaDistribution* gamma; + const LSHModel* model; + + }; /** * Function that fits two DistanceStatisticPredictors - one @@ -271,13 +340,13 @@ class LSHModel * @param Gk The geometric mean of the squared distances of a point and its * k-nearest neighbor. One column per k. */ - void ApproximateKNNStatistics(const arma::Col& referenceSizes, + void ApproximateKNNStatistics(const arma::Col& referenceSizes, const arma::Col& kValues, - const arma::mat& Ek, + const arma::mat& Ek, const arma::mat& Gk); - /** + /** * Matrix that stores, in each column, the "direction" of the perturbation: * 0 means no perturbation on that dimension, -1 means reduce dimension value * by 1, and +1 means increase dimension value by 1. diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index 6ea549c9700..400d46538e4 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -212,10 +212,75 @@ void LSHModel::Predict(const size_t datasetSize, // Step 8. Use formulas (19) and (20) from the paper to predict recall and // selectivity, using LSHModel::Rho() and the distribution functions of the // gammas we fit back in Step 6. 
- predictedRecall = 0.5; - predictedSelect = 0.5; + predictedRecall = Recall(k, numTables, numProj, numProbes, hashWidth); + predictedSelect = Selectivity(numTables, numProj, numProbes, hashWidth); } +// Uses paper's formula (19) to predict recall. +template +double LSHModel::Recall(size_t maxK, + size_t numTables, + size_t numProj, + size_t numProbes, + double hashWidth) +{ + double recall = 0; + + // Loop over k values, accumulating the probabilities. Then take average. + // k starts from one because distancesDistribution(0) is the "simple" pairwise + // distances distribution. + for (size_t k = 1; k < maxK + 1; k++) + { + // Create a helper object for this value of k. + IntegralObjective f(k, numTables, numProj, numProbes, hashWidth, + &distancesDistribution, this); + + // TODO: change with boost integration. + double from = 0; + double to = 1000; + double step = 0.01; + double integralSum = 0; + for (double i = from+step; i < to; i+=step) + { + double temp = f(i); + if (temp > 0) + integralSum += temp; // Use as function thanks to operator(). + else + break; // Gamma distribution == 0 means we're past the tail. + } + recall += integralSum * step ; + } + return recall / double(maxK); +} + +// Uses paper's formula (20) to compute selectivity +template +double LSHModel::Selectivity(size_t numTables, + size_t numProj, + size_t numProbes, + double hashWidth) +{ + + // Create a helper object for k = 0 (pairwise distances). + IntegralObjective f(0, numTables, numProj, numProbes, hashWidth, + &distancesDistribution, this); + + // TODO: change with boost integration. + double from = 0; + double to = 1000; + double step = 0.01; + double integralSum = 0; + for (double i = from+step; i < to; i+=step) + { + double temp = f(i); + if (temp > 0) + integralSum += temp; // Use as function thanks to operator(). + else + break; // Gamma distribution == 0 means we're past the tail. 
+ } + + return integralSum * step ; +} /* NOTE: My interpretation of the paper would result in this code, but LSHKIT's * implementation is different. I'm commenting this out to try their way, and I @@ -294,7 +359,7 @@ double LSHModel::Rho(double chi, double hashWidth, size_t numTables, size_t numProj, - size_t numProbes) + size_t numProbes) const { double rho = 0; @@ -316,7 +381,7 @@ double LSHModel::Rho(double chi, template double LSHModel:: SameBucketProbability(double chi, double hashWidth, short delta, size_t proj, - size_t numProj) + size_t numProj) const { boost::math::normal_distribution<> phi; if (delta == 0) From dd2bdf70e4047b0da4a4e178fb11c3300630077a Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 23 Aug 2016 16:53:50 +0300 Subject: [PATCH 12/18] Adds command line interface options to lshmodel --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 72 +----------------------- src/mlpack/methods/lsh/lshmodel_main.cpp | 65 +++++++++++++++------ 2 files changed, 50 insertions(+), 87 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index 400d46538e4..28cb6741564 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -238,7 +238,7 @@ double LSHModel::Recall(size_t maxK, // TODO: change with boost integration. double from = 0; double to = 1000; - double step = 0.01; + double step = 0.001; double integralSum = 0; for (double i = from+step; i < to; i+=step) { @@ -268,7 +268,7 @@ double LSHModel::Selectivity(size_t numTables, // TODO: change with boost integration. double from = 0; double to = 1000; - double step = 0.01; + double step = 0.001; double integralSum = 0; for (double i = from+step; i < to; i+=step) { @@ -282,74 +282,6 @@ double LSHModel::Selectivity(size_t numTables, return integralSum * step ; } -/* NOTE: My interpretation of the paper would result in this code, but LSHKIT's - * implementation is different. 
I'm commenting this out to try their way, and I - * might go back to this if I see both work the same. - -// Probability of two points being neighbors if they are at distance chi. -template -double LSHModel::Rho(double chi, - double hashWidth, - size_t numTables, - size_t numProj, - size_t numProbes) -{ - // Calculate the formula: - // 1 - {Prod{1 - Prod{same_bin_probability}}}^numTables, where: - // * same_bin_probability is calculated with the Value() function. - // * Prod{same_bin_probability} is stored in product. - // * Prod{1 - Prod{same_bin_probability}} is stored in rho. - - double rho = 1; - - // Row-major loop :(. TODO: Refactor to make column-major. - for (size_t proj = 0; proj < numProj; ++proj) - { - double product = 1; - for (size_t probe = 0; probe < numProbes; ++probe) - { - // Use perturbation value (proj, probe), i.e. \delta_{\mu, \tau} - product *= Value(chi, hashWidth, templateSequence(proj, probe), numProj); - } - - rho *= (1 - product); - } - - return 1 - std::pow(rho, numTables); -} - -// Probability of two points being neighbors if they are at distance chi. -template -double LSHModel::SameBucketProbability(double chi, - double hashWidth, - short delta, - size_t proj, - size_t numProj) -{ - if (delta == 0) - { - // No perturbation - probability of two queries sharing the same bin. - // Use the "default" normal distribution with mean = 0, sd = 1. - boost::math::normal_distribution phi; - return 2 * phi.pdf(hashWidth / chi) - 1 - + std::sqrt(2 / M_PI) - * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi); - } - else - { - // +1/-1 perturbation - probability of two queries being in adjacent bins. - double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0)); - - // Negative perturbation - flip deltaI. - if (delta == -1) - deltaI = 1 - deltaI; - - boost::math::normal_distribution phi(-delta, chi); - return phi.cdf(hashWidth) - phi.cdf(0); - } -} -*/ - /* * Based on the LSHKIT implementation, not my understanding of the paper. 
*/ diff --git a/src/mlpack/methods/lsh/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp index 20b9f0ee4f6..88e79410bfa 100644 --- a/src/mlpack/methods/lsh/lshmodel_main.cpp +++ b/src/mlpack/methods/lsh/lshmodel_main.cpp @@ -3,36 +3,67 @@ #include "lshmodel.hpp" +using std::string; using std::endl; using std::cout; using namespace mlpack; using namespace mlpack::neighbor; -PROGRAM_INFO("LSH Model (TODO: Complete this)", ""); +PROGRAM_INFO("LSH Modeling and Tuning", + "This program can help tune parameters for the LSH algorithm for" + " approximate nearest neighbor search. Currently, the only option is to" + " specify a number of the four parameters (numTables, numProj, numProbes," + " hashWidth) and receive an estimate of LSH's recall and selectivity for a" + " given dataset." + ); PARAM_STRING_IN("reference_file", "File containing the dataset", "r", ""); -PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m"); +PARAM_DOUBLE_IN("sample_percentage", "Sample size percentage. Must be in (0, 1]", "p", 0.0) + +PARAM_INT_IN("neighbors", "The number of nearest neighbors LSH will search for", "k", 1); +PARAM_INT_IN("tables", "The number of tables for LSH", "L", 30); +PARAM_INT_IN("projections", "The number of projections per table for LSH", "K", 10); +PARAM_INT_IN("probes", "The number of probes for multiprobe LSH", "T", 0); +PARAM_DOUBLE_IN("hash_width", "The hash width for the first level hashing", "H", 1.0); + +//PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m"); int main(int argc, char* argv[]) { CLI::ParseCommandLine(argc, argv); - // Generate a random point set. - size_t N = 5000; - size_t d = 10; - size_t k = 5; - double sampleSize = 0.25; + // If no input file was specified, die here. + if (!CLI::HasParam("reference_file")) + Log::Fatal << "You need to specify the reference file." << endl; + // Read input file name + string rfile = CLI::GetParam("reference_file"); + // Attempt to read file. 
+ arma::mat rdata; + data::Load(rfile, rdata, true); // true: if you can't open file, die. + size_t N = rdata.n_cols; // Dataset size. + + // Parse rest of command line input. + size_t k = CLI::GetParam("neighbors"); + size_t numTables = CLI::GetParam("tables"); + size_t numProj = CLI::GetParam("projections"); + size_t numProbes = CLI::GetParam("probes"); + double hashWidth = CLI::GetParam("hash_width"); + double sampleSize = CLI::GetParam("sample_percentage"); + if (sampleSize == 0.0) + Log::Fatal << "You need to specify the sampling percentage." << endl; + + Log::Info << + "Tuning LSH for" << std::endl + <<"\t numTables = " << numTables << std::endl + <<"\t numProj = " << numProj << std::endl + <<"\t numProbes = " << numProbes << std::endl + <<"\t hashWidth = " << hashWidth << std::endl; + double recall, selectivity; - arma::mat rdata(d, N, arma::fill::randu); - LSHModel<> model(rdata, sampleSize, k); - model.Predict(N, k, 16, 4, 4, 1.0, recall, selectivity); - Log::Info << "Model predicts " << recall*100 << "\% recall and " - << selectivity*100 << "\% selectivity." << std::endl; + LSHModel<> model(rdata, sampleSize, k); + model.Predict(N, k, numTables, numProj, numProbes, hashWidth, recall, selectivity); - arma::mat qdata(d, 1, arma::fill::randu); - arma::Mat neighbors; - arma::mat distances; - LSHSearch<> lsh = model.LSHObject(1, 1, 1.0, 99901, 500); - lsh.Search(qdata, 1, neighbors, distances); + cout << "Model predicts " << recall*100 << "\% recall and " + << selectivity*100 << "\% selectivity." 
<< endl; return 0; } From 57c9d5e634d7d3d7e2ca1618353fe37d9e23b34a Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 23 Aug 2016 17:49:44 +0300 Subject: [PATCH 13/18] Changes way kNN samples are generated --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 44 ++++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index 28cb6741564..dbdcaebe943 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -104,36 +104,43 @@ void LSHModel::Train( // through exponentiating the mean of the logarithms of x: // exp(mean(log(x))) = geometricmean(x). - // Number of samples to create for modeling the Gamma Distributions - size_t regressionExamples = 50; // TODO: parameter? - - // Number of points to use as queries. - size_t numAnchors = (size_t) std::round(0.1 * numSamples); - arma::mat queryMat = sampleSet.cols(0, numAnchors - 1); - // Evenly spaced sample sizes. - arma::Col referenceSizes = arma::conv_to< arma::Col >::from( - arma::linspace(numAnchors, numSamples - numAnchors - 1, - regressionExamples)); + // Number of points to use as queries. Use 10% of sample. + double anchorsSample = 0.1; + size_t numAnchors = (size_t) std::round(anchorsSample * numSamples); // Statistics - Arithmetic and geometric means for growing reference set. // Compute one of each for each k. + size_t regressionExamples = size_t( + std::round((1.0 - anchorsSample) / anchorsSample)); arma::mat Ek(regressionExamples, k); arma::mat Gk(regressionExamples, k); - // For each referenceSize, calculate the kNN of the anchors + // For each reference size, calculate the kNN of the anchors. Divide reference + // set into equal blocks (block 1 is anchors). In repetition 1, use block 2 as + // reference set, in repetition 2, blocks 2 and 3, and so on. 
+ size_t refSetStart = numAnchors; + size_t refSetEnd = numAnchors; + arma::mat queryMat = sampleSet.cols(0, numAnchors - 1); + arma::Col referenceSizes(regressionExamples); + Log::Info.ignoreInput = true; // Ignore kNN output. + // TODO: Since we've already computed this, avoid calling kNN? for (size_t i = 0; i < regressionExamples; ++i) { - // TODO: Since we've already computed this, avoid calling kNN? + refSetEnd += refSetStart - 1; + + cout << "Neighbors "<< refSetStart <<":"< neighbors; // Not going to be used but required. arma::mat kNNDistances; // What we need. KNN naive(refMat, true); // true: train and use naive kNN. naive.Search(queryMat, k, neighbors, kNNDistances); + + // Store the squared distances (what we need). kNNDistances = arma::pow(kNNDistances, 2); // Compute Arithmetic and Geometric mean of the distances. @@ -206,14 +213,15 @@ void LSHModel::Predict(const size_t datasetSize, Timer::Stop("fitting_distributions"); // Step 7. Generate the Template Probing Sequence using the maximum number of - // projections and the maximum number of probes. - GenerateTemplateSequence(numProj, numProbes); + // projections and the maximum number of probes. +1 because 0 additional + // probes means 1 probe total. + GenerateTemplateSequence(numProj, numProbes + 1); // Step 8. Use formulas (19) and (20) from the paper to predict recall and // selectivity, using LSHModel::Rho() and the distribution functions of the // gammas we fit back in Step 6. - predictedRecall = Recall(k, numTables, numProj, numProbes, hashWidth); - predictedSelect = Selectivity(numTables, numProj, numProbes, hashWidth); + predictedRecall = Recall(k, numTables, numProj, numProbes + 1, hashWidth); + predictedSelect = Selectivity(numTables, numProj, numProbes + 1, hashWidth); } // Uses paper's formula (19) to predict recall. 
From a0626a8c41fedc60ec255ef2939a519dfac5b83a Mon Sep 17 00:00:00 2001 From: mentekid Date: Wed, 24 Aug 2016 19:14:53 +0300 Subject: [PATCH 14/18] Prevents log(0) which causes errors. Infinite loop still happens occasionaly --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index dbdcaebe943..c4c2f3a4ea5 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -129,8 +129,6 @@ void LSHModel::Train( { refSetEnd += refSetStart - 1; - cout << "Neighbors "<< refSetStart <<":"<::Train( KNN naive(refMat, true); // true: train and use naive kNN. naive.Search(queryMat, k, neighbors, kNNDistances); + // If identical points are found, disregard their distance to avoid log(0). + kNNDistances = kNNDistances.cols(arma::find(kNNDistances > 0)); + // Store the squared distances (what we need). kNNDistances = arma::pow(kNNDistances, 2); @@ -147,6 +148,8 @@ void LSHModel::Train( Ek.row(i) = arma::mean(kNNDistances.t()); Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0)); } + cout << Ek << endl; + cout << Gk << endl; Log::Info.ignoreInput = false; // Keep giving normal output. // Step 5. Model the arithmetic and geometric mean according to the paper. From cc1b6910d9ed28b5ab8d1911b9af4642ca8475e7 Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 30 Aug 2016 10:09:48 +0300 Subject: [PATCH 15/18] Removes parameterized objective function constructor --- src/mlpack/methods/lsh/objectivefunction.hpp | 22 ++++---------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/mlpack/methods/lsh/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp index 2aa60ab8054..6ddc51c8177 100644 --- a/src/mlpack/methods/lsh/objectivefunction.hpp +++ b/src/mlpack/methods/lsh/objectivefunction.hpp @@ -28,20 +28,6 @@ class DefaultObjectiveFunction //! 
Default constructor - do nothing. DefaultObjectiveFunction() { }; - /** - * Parameterized constructor. - * - * @param xData Vector of x - the sizes of the reference set when performing - * kNN. - * @param kData Vector of k - the kth nearest neighbor for which we - * calculated the statistic. - * @param yData Matrix of y, one for each (x, k) value. - */ - DefaultObjectiveFunction(const arma::Col& xData, - const arma::Col& kData, - const arma::mat& yData) - : xData(&xData), kData(&kData), yData(&yData) { }; - //! Return the number of functions size_t NumFunctions(void) const { return yData->n_elem; } @@ -86,10 +72,10 @@ class DefaultObjectiveFunction double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const { // Use extra variables to make code readable. - double alpha = coordinates(0, 0); - double beta = coordinates(1, 0); - double gamma = coordinates(2, 0); - double M = (double) NumFunctions(); + const double alpha = coordinates(0, 0); + const double beta = coordinates(1, 0); + const double gamma = coordinates(2, 0); + const double M = (double) NumFunctions(); // Sum the squared error for each element in yData. 
double sum = 0; From fb308e80d00990f7ee697cbb26e74af045aa4bf5 Mon Sep 17 00:00:00 2001 From: mentekid Date: Tue, 30 Aug 2016 13:59:34 +0300 Subject: [PATCH 16/18] Solves NaN values issue --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 39 +++++++++++++++----- src/mlpack/methods/lsh/objectivefunction.hpp | 22 +++++++++-- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index c4c2f3a4ea5..1a66377e03f 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -84,6 +84,21 @@ void LSHModel::Train( for (size_t j = i + 1; j < numSamples; ++j) distances(d++) = metric::EuclideanDistance::Evaluate( sampleSet.unsafe_col(i), sampleSet.unsafe_col(j)); + + // We need to take the logarithm of these distances, so replace the 0s with + // very small values. + // Find smallest value + double smallest = DBL_MAX; + for (size_t i = 0; i < d; ++i) + if (distances(i) < smallest && distances(i) > 0) + smallest = distances(i); + + // Replace 0s with fraction of smallest value. + for (size_t i = 0; i < d; ++i) + if (distances(i) == 0) + distances(i) = 1e-5 * smallest; + + Log::Info << "Computed " << d << " pointwise distances." << std::endl; Timer::Stop("pairwise_distances"); @@ -112,6 +127,8 @@ void LSHModel::Train( // Compute one of each for each k. size_t regressionExamples = size_t( std::round((1.0 - anchorsSample) / anchorsSample)); + + // store statistics for the distances. arma::mat Ek(regressionExamples, k); arma::mat Gk(regressionExamples, k); @@ -138,18 +155,19 @@ void LSHModel::Train( KNN naive(refMat, true); // true: train and use naive kNN. naive.Search(queryMat, k, neighbors, kNNDistances); - // If identical points are found, disregard their distance to avoid log(0). - kNNDistances = kNNDistances.cols(arma::find(kNNDistances > 0)); + // Replace 0s again. 
+ for (size_t c = 0; c < kNNDistances.n_cols; ++c) + for (size_t r = 0; r < kNNDistances.n_rows; ++r) + if (kNNDistances(r, c) == 0) + kNNDistances(r, c) = 1e-5 * smallest; // Store the squared distances (what we need). kNNDistances = arma::pow(kNNDistances, 2); // Compute Arithmetic and Geometric mean of the distances. - Ek.row(i) = arma::mean(kNNDistances.t()); - Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0)); + Ek.row(i) = arma::mean(kNNDistances, 1).t(); + Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances), 1)).t(); } - cout << Ek << endl; - cout << Gk << endl; Log::Info.ignoreInput = false; // Keep giving normal output. // Step 5. Model the arithmetic and geometric mean according to the paper. @@ -199,8 +217,9 @@ void LSHModel::Predict(const size_t datasetSize, meanVec(0) = this->meanDist; logMeanVec(0) = this->logMeanDist; meanLogVec(0) = this->meanLogDist; - // Train gamma and put in gammaDists[0]. + // Use the trained predictors (Step 5) to predict arithmetic and geometric + // means for each k value. Timer::Start("fitting_distributions"); for (size_t i = 1; i <= k; ++i) { @@ -211,13 +230,13 @@ void LSHModel::Predict(const size_t datasetSize, // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog meanLogVec(i) = std::log(gMeanPredictor.Predict(datasetSize, k)); } - // Fit the distribution. + // Fit the distribution using the estimated and computed statistics. distancesDistribution.Train(logMeanVec, meanLogVec, meanVec); Timer::Stop("fitting_distributions"); // Step 7. Generate the Template Probing Sequence using the maximum number of - // projections and the maximum number of probes. +1 because 0 additional - // probes means 1 probe total. + // projections and the maximum number of probes. + // +1 because 0 additional probes means 1 probe total. GenerateTemplateSequence(numProj, numProbes + 1); // Step 8. 
Use formulas (19) and (20) from the paper to predict recall and diff --git a/src/mlpack/methods/lsh/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp index 6ddc51c8177..2aa60ab8054 100644 --- a/src/mlpack/methods/lsh/objectivefunction.hpp +++ b/src/mlpack/methods/lsh/objectivefunction.hpp @@ -28,6 +28,20 @@ class DefaultObjectiveFunction //! Default constructor - do nothing. DefaultObjectiveFunction() { }; + /** + * Parameterized constructor. + * + * @param xData Vector of x - the sizes of the reference set when performing + * kNN. + * @param kData Vector of k - the kth nearest neighbor for which we + * calculated the statistic. + * @param yData Matrix of y, one for each (x, k) value. + */ + DefaultObjectiveFunction(const arma::Col& xData, + const arma::Col& kData, + const arma::mat& yData) + : xData(&xData), kData(&kData), yData(&yData) { }; + //! Return the number of functions size_t NumFunctions(void) const { return yData->n_elem; } @@ -72,10 +86,10 @@ class DefaultObjectiveFunction double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const { // Use extra variables to make code readable. - const double alpha = coordinates(0, 0); - const double beta = coordinates(1, 0); - const double gamma = coordinates(2, 0); - const double M = (double) NumFunctions(); + double alpha = coordinates(0, 0); + double beta = coordinates(1, 0); + double gamma = coordinates(2, 0); + double M = (double) NumFunctions(); // Sum the squared error for each element in yData. double sum = 0; From f47e06938ce4b77907177aa5b3afc780a5a05b7f Mon Sep 17 00:00:00 2001 From: Ryan Curtin Date: Wed, 9 Nov 2016 13:54:03 -0500 Subject: [PATCH 17/18] (hopefully) Fix SameBucketProbability() and add some tests for it. I had to mark some private methods public to test these correctly, so maybe we should consider doing a little refactoring or redesign there, but I have not thought much about how. 
--- src/mlpack/methods/lsh/lshmodel.hpp | 3 + src/mlpack/methods/lsh/lshmodel_impl.hpp | 12 ++-- src/mlpack/tests/lsh_test.cpp | 79 ++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp index 5c250404bb7..06b0340e061 100644 --- a/src/mlpack/methods/lsh/lshmodel.hpp +++ b/src/mlpack/methods/lsh/lshmodel.hpp @@ -219,6 +219,7 @@ class LSHModel * want to compute the template perturbation sequence. * @param numProbes The number of probes to generate. */ + public: void GenerateTemplateSequence(size_t numProj, size_t numProbes); @@ -241,6 +242,7 @@ class LSHModel size_t numTables, size_t numProj, size_t numProbes) const; + /** * This is a helper function that is called by Rho() and returns the inner * value of the product used in the calculation of the probability that Rho @@ -257,6 +259,7 @@ class LSHModel short delta, size_t proj, size_t numProj) const; + private: /** * This function calculates the recall of LSH for a given set of parameters. diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index 1a66377e03f..cb8e212ed0d 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -349,15 +349,19 @@ SameBucketProbability(double chi, double hashWidth, short delta, size_t proj, if (delta == 0) { // No perturbation - probability of two queries sharing the same bin. - return 2 * pdf(phi, hashWidth / chi) - 1 - + std::sqrt(2 / M_PI) - * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi); + // The derivation to come to this solution is... pretty intense. If you + // want to reproduce it, take equation (13) from the paper and expand it. + // Integrate the Gaussian PDF phi((z - x) / d), then take the expected value + // over z assuming a uniform distribution (so f(z) = 1/W). 
After some + // integration and algebraic simplification, you should come to the result + // below. + return 2 * cdf(phi, hashWidth / chi) - 1.0; } else { // +1/-1 perturbation - probability of two queries being in adjacent bins. double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0)); - + // Negative perturbation - flip deltaI. if (delta == -1) deltaI = 1 - deltaI; diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp index 33485fce76d..64110d92e66 100644 --- a/src/mlpack/tests/lsh_test.cpp +++ b/src/mlpack/tests/lsh_test.cpp @@ -10,6 +10,7 @@ #include #include +#include using namespace std; using namespace mlpack; @@ -831,4 +832,82 @@ BOOST_AUTO_TEST_CASE(ParallelMonochromatic) } #endif +// Test that LSHModel::Rho() returns reasonable results. +BOOST_AUTO_TEST_CASE(RhoTest) +{ + arma::mat data(10, 1000, arma::fill::randu); + LSHModel<> m(data, 0.1, 3); + m.GenerateTemplateSequence(5, 5); + + // Two identical points should have high probability of being in the same bin. + for (double hw = 0.1; hw < 0.5; hw++) + BOOST_REQUIRE_CLOSE(m.Rho(0.0, hw, 5, 5, 5), 1.0, 1e-5); + + // Two very faraway points should have very small probability of being in the + // same bin. + for (double hw = 0.1; hw <= 0.5; hw++) + BOOST_REQUIRE_SMALL(m.Rho(5.0, hw, 5, 5, 5), 1e-5); +} + +// Test that LSHModel::SameBucketProbability() returns reasonable results when +// delta = 0. +BOOST_AUTO_TEST_CASE(SameBucketProbabilityDelta0Test) +{ + // Create a simple LSHModel. + arma::mat data(10, 100, arma::fill::randu); + LSHModel<> m(data, 0.1, 1); + + // When the points are far and the hash width is small the probability should + // be very close to 0. + BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, 0, 5, 10), 1e-5); + + // When the points are close and the hash width is large the probability + // should be very close to 1. 
+ BOOST_REQUIRE_CLOSE(m.SameBucketProbability(1e-2, 1e6, 0, 0, 3), 1.0, 1e-5); + + // For random points, the probability should be between 0 and 1. + for (size_t i = 0; i < 1000; ++i) + { + const double r = math::Random(); + + const double p = m.SameBucketProbability(r, 0.5, 0, 0, 3); + BOOST_REQUIRE_GE(p, 0.0); + BOOST_REQUIRE_LE(p, 1.0); + } +} + +// Test that LSHModel::SameBucketProbability() returns reasonable results when +// delta = 1 or delta = -1. +BOOST_AUTO_TEST_CASE(SameBucketProbabilityDelta1Test) +{ + // Create a simple LSHModel. + arma::mat data(10, 100, arma::fill::randu); + LSHModel<> m(data, 0.1, 1); + + // When the points are very far and the hash width is small the probability + // should be very close to 0, regardless of delta. + BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, 1, 5, 10), 1e-5); + BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, -1, 5, 10), 1e-5); + + // When the points are close(ish) and the hash width is large the probability + // should still be close to 0 because delta != 0 means that we are searching + // adjacent bins. + BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e-2, 1e2, 1, 0, 3), 1e-5); + BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e-2, 1e2, -1, 0, 3), 1e-5); + + // For random points, the probability should be between 0 and 1.
+ for (size_t i = 0; i < 1000; ++i) + { + const double r = math::Random(); + + const double p = m.SameBucketProbability(r, 0.5, 1, 0, 3); + BOOST_REQUIRE_GE(p, 0.0); + BOOST_REQUIRE_LE(p, 1.0); + + const double p2 = m.SameBucketProbability(r, 0.5, -1, 0, 3); + BOOST_REQUIRE_GE(p2, 0.0); + BOOST_REQUIRE_LE(p2, 1.0); + } +} + BOOST_AUTO_TEST_SUITE_END(); From f05b0e034fd8210a9dd6a94d6874e7fab2c4b145 Mon Sep 17 00:00:00 2001 From: mentekid Date: Sun, 20 Nov 2016 10:31:30 +0000 Subject: [PATCH 18/18] Switches to NAIVE_MODE for kNN --- src/mlpack/methods/lsh/lshmodel_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp index cb8e212ed0d..3c4a6680ca9 100644 --- a/src/mlpack/methods/lsh/lshmodel_impl.hpp +++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp @@ -152,7 +152,7 @@ void LSHModel::Train( arma::Mat neighbors; // Not going to be used but required. arma::mat kNNDistances; // What we need. - KNN naive(refMat, true); // true: train and use naive kNN. + KNN naive(refMat, NAIVE_MODE); // NAIVE_MODE: train and use naive kNN. naive.Search(queryMat, k, neighbors, kNNDistances); // Replace 0s again.