From 8381adc17ea07a73d3ce87e0f2b48a8baca18c15 Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Wed, 27 Jul 2016 11:36:34 +0100
Subject: [PATCH 01/18] Skeleton of Project

---
 src/mlpack/methods/CMakeLists.txt             |  1 +
 src/mlpack/methods/lsh/lsh_search.hpp         |  1 +
 src/mlpack/methods/lsh_model/CMakeLists.txt   | 19 +++++++
 src/mlpack/methods/lsh_model/lshmodel.hpp     | 55 +++++++++++++++++++
 .../methods/lsh_model/lshmodel_impl.hpp       |  1 +
 .../methods/lsh_model/lshmodel_main.cpp       | 15 +++++
 6 files changed, 92 insertions(+)
 create mode 100644 src/mlpack/methods/lsh_model/CMakeLists.txt
 create mode 100644 src/mlpack/methods/lsh_model/lshmodel.hpp
 create mode 100644 src/mlpack/methods/lsh_model/lshmodel_impl.hpp
 create mode 100644 src/mlpack/methods/lsh_model/lshmodel_main.cpp

diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt
index dbbd2318bee..6a098340339 100644
--- a/src/mlpack/methods/CMakeLists.txt
+++ b/src/mlpack/methods/CMakeLists.txt
@@ -36,6 +36,7 @@ set(DIRS
   local_coordinate_coding
   logistic_regression
   lsh
+  lsh_model
 #  mvu
   matrix_completion
   naive_bayes
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 4e6cc97b3d9..62aa64b8542 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -1,6 +1,7 @@
 /**
  * @file lsh_search.hpp
  * @author Parikshit Ram
+ * @author Yannis Mentekidis
  *
  * Defines the LSHSearch class, which performs an approximate
  * nearest neighbor search for a queries in a query set
diff --git a/src/mlpack/methods/lsh_model/CMakeLists.txt b/src/mlpack/methods/lsh_model/CMakeLists.txt
new file mode 100644
index 00000000000..c3799753aec
--- /dev/null
+++ b/src/mlpack/methods/lsh_model/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Define the files we need to compile.
+# Anything not in this list will not be compiled into mlpack.
+set(SOURCES
+  # LSH-model class
+  lshmodel.hpp
+  lshmodel_impl.hpp
+)
+
+# Add directory name to sources.
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+# Append sources (with directory name) to list of all mlpack sources (used at
+# the parent scope).
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
+
+# The code that models LSH to return a set of parameters that works well.
+add_cli_executable(lshmodel)
diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
new file mode 100644
index 00000000000..073f9576a6b
--- /dev/null
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -0,0 +1,55 @@
+/**
+ * @file lshmodel.hpp
+ * @author Yannis Mentekidis
+ *
+ * Defines the LSHModel class, which models the Locality Sensitive Hashing
+ * algorithm. The model identifies parameter sets that produce satisfactory
+ * results while keeping execution time low.
+ * 
+ * The model was proposed by Dong et al in the following paper.
+ *
+ * @code
+ * @article{Dong2008LSHModel,
+ *  author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar, 
+ *      Moses and Li, Kai},
+ *  title = {{Modeling LSH for performance tuning}},
+ *  journal = {Proceeding of the 17th ACM conference on Information and 
+ *      knowledge mining - CIKM '08},
+ *  pages = {669},
+ *  url = {http://portal.acm.org/citation.cfm?doid=1458082.1458172},
+ *  year = {2008}
+ * }
+ * @endcode
+ *
+ * We use a different method to fit Gamma Distributions to pairwise distances.
+ * Instead of the MLE method proposed in the paper above, we use the mlpack
+ * class GammaDistribution, which implements fitting according to Thomas Minka's
+ * work.
+ *
+ * @code
+ * @techreport{minka2002estimating,
+ *   title={Estimating a {G}amma distribution},
+ *   author={Minka, Thomas P.},
+ *   institution={Microsoft Research},
+ *   address={Cambridge, U.K.},
+ *   year={2002}
+ * }
+ * @endcode
+ */
+
+
+namespace mlpack {
+namespace neighbor {
+
+class LSHModel
+{
+ public:
+
+
+ private:
+
+
+}; // class LSHModel.
+
+} // namespace neighbor.
+} // namespace mlpack.
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
new file mode 100644
index 00000000000..a267c74ae20
--- /dev/null
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -0,0 +1 @@
+#include "lshmodel.hpp"
diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
new file mode 100644
index 00000000000..e28d2644bbf
--- /dev/null
+++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
@@ -0,0 +1,15 @@
+#include <mlpack/core.hpp>
+
+#include "lshmodel.hpp"
+using namespace mlpack;
+
+PROGRAM_INFO("LSH Model (TODO: Complete this)", "");
+
+PARAM_STRING_IN("reference_file", "File containing the dataset", "r", "");
+PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m");
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Hello!" << std::endl;
+  return 0;
+}

From aa6a123819944a88b1d501be33ce15a00feb966a Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Tue, 2 Aug 2016 14:44:08 +0100
Subject: [PATCH 02/18] Almost complete LSHModel skeleton

---
 src/mlpack/methods/lsh_model/lshmodel.hpp     | 239 ++++++++++++++++-
 .../methods/lsh_model/lshmodel_impl.hpp       | 242 ++++++++++++++++++
 .../methods/lsh_model/lshmodel_main.cpp       |  12 +-
 3 files changed, 489 insertions(+), 4 deletions(-)

diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
index 073f9576a6b..c6ba93d4500 100644
--- a/src/mlpack/methods/lsh_model/lshmodel.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -5,15 +5,15 @@
  * Defines the LSHModel class, which models the Locality Sensitive Hashing
  * algorithm. The model identifies parameter sets that produce satisfactory
  * results while keeping execution time low.
- * 
+ *
  * The model was proposed by Dong et al in the following paper.
  *
  * @code
  * @article{Dong2008LSHModel,
- *  author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar, 
+ *  author = {Dong, Wei and Wang, Zhe and Josephson, William and Charikar,
  *      Moses and Li, Kai},
  *  title = {{Modeling LSH for performance tuning}},
- *  journal = {Proceeding of the 17th ACM conference on Information and 
+ *  journal = {Proceeding of the 17th ACM conference on Information and
  *      knowledge mining - CIKM '08},
  *  pages = {669},
  *  url = {http://portal.acm.org/citation.cfm?doid=1458082.1458172},
@@ -37,19 +37,252 @@
  * @endcode
  */
 
+#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_HPP
+#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_HPP
+
+// For returning LSHSearch objects.
+#include <mlpack/methods/lsh/lsh_search.hpp>
+#include <mlpack/methods/neighbor_search/neighbor_search.hpp>
+#include <mlpack/methods/neighbor_search/sort_policies/nearest_neighbor_sort.hpp>
 
 namespace mlpack {
 namespace neighbor {
 
+template <typename SortPolicy = NearestNeighborSort>
 class LSHModel
 {
  public:
 
+   //! Empty Constructor. Do nothing
+   LSHModel(){ /* Do nothing. */ };
+
+   /** Parameterized Constructor. This function initializes the object and
+    * trains it with the provided reference set.
+    *
+    * @param referenceSet The data that will be used as a reference set for LSH
+    *     to run queries against. We will fit distributions based on this data
+    *     and produce good parameters for it.
+    * @param minRecall The minimum recall we want to guarantee. The parameters
+    *     we will estimate will try to keep average recall of LSH above this.
+    *     Must be in [0, 1).
+    * @param sampleSize The percentage of the reference set to sample for the
+    *     estimation. Naive all-kNN will be run on this sample, so if it is too
+    *     big, training will be very slow. Must be in (0, 1].
+    * @param k The number of nearest neighbors wanted for each query.
+    */
+   LSHModel(
+       const arma::mat &referenceSet,
+       const double minRecall,
+       const double sampleSize,
+       const size_t k);
+
+   //! Destructor. If we own any memory, free it.
+   ~LSHModel();
+
+   /**
+    * Trains the LSHModel. Fits distributions using referenceSet and then looks
+    * for LSH parameters that would return recalls larger than minRecall in the
+    * lowest cost (selectivity) possible.
+    *
+    * The model can estimate good values for the parameters:
+    *   * numProj: Number of projections per projection table.
+    *   * numTables: Number of projection tables.
+    *   * hashWidth: Hash width of the LSH hash.
+    *   * numProbes: Number of probes for multiprobe LSH.
+    *
+    * Train stores the computed parameters in the LSHModel object's variables.
+    *
+    * @param referenceSet The data that will be used as a reference set for LSH
+    *     to run queries against. We will fit distributions based on this data
+    *     and produce good parameters for it.
+    * @param minRecall The minimum recall we want to guarantee. The parameters
+    *     we will estimate will try to keep average recall of LSH above this.
+    *     Must be in [0, 1).
+    * @param sampleSize The percentage of the reference set to sample for the
+    *     estimation. Naive all-kNN will be run on this sample, so if it is too
+    *     big, training will be very slow. Must be in (0, 1].
+    * @param k The number of nearest neighbors wanted for each query.
+    */
+   void Train(
+       const arma::mat &referenceSet,
+       const double minRecall,
+       const double sampleSize,
+       const size_t k);
+
+   /**
+    * This function returns an LSHSearch object trained with the parameters
+    * calculated when the LSHModel was trained.
+    * If any of the parameters we trained for (numProj, numTables, hashWidth)
+    * are specified, the provided values are used instead of the trained ones.
+    * If these are left to default (0), the estimated parameters will be used.
+    *
+    * @param numProjIn The number of projections per table.
+    * @param numTablesIn The number of projection tables.
+    * @param hashWidthIn The first level hash width.
+    * @param secondHashSize The second level hash width.
+    * @param bucketSize The second level bucket size.
+    */
+   LSHSearch<SortPolicy>* LSHObject(
+       const size_t numProjIn = 0,
+       const size_t numTablesIn = 0,
+       const double hashWidthIn = 0.0,
+       const size_t secondHashSize = 99901,
+       const size_t bucketSize = 500);
+
+   //! Return the number of projections calculated.
+   size_t NumProj(void) const { return numProj; };
+
+   //! Return the number of tables calculated.
+   size_t NumTables(void) const { return numTables; };
+
+   //! Return the calculated hash width.
+   double HashWidth(void) const { return hashWidth; };
+
+   //! Return the calculated number of probes.
+   double NumProbes(void) const { return numProbes; };
+
+   //! Return the reference set.
+   const arma::mat ReferenceSet(void) const {return *referenceSet; };
+
+   //! Serialize the LSHModel object.
+   template<typename Archive>
+   void Serialize(Archive& ar);
 
  private:
 
+   /**
+    * This is a helper class that uses the function a * k^b * N^c for some
+    * parameters a, b, c that have been fit to either predict the arithmetic or
+    * geometric mean of the squared distance of a point to its k-nearest
+    * neighbor, given some dataset size N.
+    */
+   class DistanceStatisticPredictor
+   {
+    public:
+      //! Empty constructor.
+      DistanceStatisticPredictor() { };
+
+      //! Construct with training set.
+      DistanceStatisticPredictor(const arma::Col<size_t>& inputSize, 
+                                 const arma::vec& statistic,
+                                 size_t k) 
+        : k(k)
+      { Train(inputSize, statistic); };
+      
+      //! Default destructor.
+      ~DistanceStatisticPredictor() { };
+
+      /**
+       * Function that fits the alpha, beta and gamma parameters.
+       *
+       * @param inputSize A vector of input sizes. The input variable of the
+       *     regression.
+       * @param statistic A vector of responses - the value of the statistic for
+       *     each given inputSize.
+       */
+      void Train(const arma::Col<size_t>& inputSize, const arma::vec& statistic);
+
+      /** 
+       * Evaluate the statistic for a given dataset size.
+       *
+       * @param N - a new input size for which to evaluate the expected
+       *     statistic.
+       */
+      double Predict(size_t N) 
+      { return alpha * std::pow(k, beta) * std::pow(N, gamma); };
+
+      //! Set the alpha parameter.
+      void Alpha(double a) { alpha = a; };
+
+      //! Get the alpha parameter.
+      double Alpha(void) { return alpha; };
+      
+      //! Set the beta parameter.
+      void Beta(double b) { beta = b; };
+
+      //! Get the beta parameter.
+      double Beta(void) { return beta; };
+
+      //! Set the gamma parameter.
+      void Gamma(double c) { gamma = c; };
+
+      //! Get the gamma parameter.
+      double Gamma(void) { return gamma; };
+
+      //! Set the k parameter.
+      void K(double kIn) { k = kIn; };
+
+      //! Get the k parameter.
+      double K(void) { return k; };
+
+    private:
+      double alpha;
+      double beta;
+      double gamma;
+      double k;
+   };
+
+   //! Vector of DistanceStatisticPredictors for arithmetic mean.
+   std::vector<DistanceStatisticPredictor> aMeanPredictors;
+
+   //! Vector of DistanceStatisticPredictors for geometric mean.
+   std::vector<DistanceStatisticPredictor> gMeanPredictors;
+   
+   /**
+    * Function that fits two DistanceStatisticPredictors for each k - one
+    * to predict arithmetic mean and one to preduct geometric mean.
+    *
+    * @param referenceSizes The number of reference points for each kNN search.
+    * @param Ek The arithmetic mean of the squared distances of a point and its
+    *      k-nearest neighbor. One column per k.
+    * @param Gk The geometric mean of the squared distances of a point and its
+    *      k-nearest neighbor. One column per k.
+    */
+   void ApproximateKNNStatistics(
+       arma::Col<size_t> referenceSizes, 
+       arma::mat Ek, 
+       arma::mat Gk);
+
+   //! Flag that tracks if we own the reference set.
+   bool ownsSet;
+
+   //! Flag that tracks if we own an LSHSearch object.
+   bool ownsLSHObject;
+
+   //! Number of projections per table.
+   size_t numProj;
+
+   //! Number of projection tables.
+   size_t numTables;
+
+   //! First-level hash width.
+   double hashWidth;
+
+   //! Number of probes for multiprobe LSH.
+   size_t numProbes;
+
+   //! Reference dataset.
+   const arma::mat* referenceSet;
+
+   //! Vector of LSHSearch objects.
+   std::vector< LSHSearch<SortPolicy> > lshObjectVector;
+
+   //! Statistic: average squared distance of points.
+   double meanDist;
+
+   //! Statistic: logarithm of the mean squared distance of points.
+   double logMeanDist;
+
+   //! Statistic: average of logarithm of squared distances of points.
+   double meanLogDist;
+
 
 }; // class LSHModel.
 
 } // namespace neighbor.
 } // namespace mlpack.
+
+// Include the class implementation.
+#include "lshmodel_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index a267c74ae20..1df2ac0c3ad 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -1 +1,243 @@
+/**
+ * @file lshmodel_impl.hpp
+ * @author Yannis Mentekidis
+ *
+ * Implementation of the LSHModel functions.
+ */
+#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP
+#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP
+
 #include "lshmodel.hpp"
+
+
+//TODO: remove
+using std::cout;
+using std::flush;
+using std::endl;
+
+namespace mlpack {
+namespace neighbor {
+
+// Constructor sets variables and trains the object.
+template <typename SortPolicy>
+LSHModel<SortPolicy>::LSHModel(const arma::mat &referenceSet,
+                   const double minRecall,
+                   const double sampleSize,
+                   const size_t k)
+{
+  // We don't own the set - we just point to it.
+  ownsSet = false;
+  this->referenceSet = &referenceSet;
+
+  Train(referenceSet, minRecall, sampleSize, k);
+}
+
+// Destructor must de-allocate any referenceSet and LSHSearch objects we own.
+template <typename SortPolicy>
+LSHModel<SortPolicy>::~LSHModel()
+{
+  if (ownsSet)
+    delete referenceSet;
+};
+
+// Trains the object.
+template <typename SortPolicy>
+void LSHModel<SortPolicy>::Train(const arma::mat &referenceSet,
+                     const double minRecall,
+                     const double sampleSize,
+                     const size_t k)
+{
+  // TODO: Implement
+
+  // Sanity check: recall must be in [0, 1) and sampleSize in (0, 1].
+  if (minRecall >= 1 || minRecall < 0)
+    throw std::runtime_error("Recall must be floating point number in [0, 1)");
+
+  if (sampleSize > 1 || sampleSize <= 0)
+    throw std::runtime_error(
+        "Sampling rate must be floating point number in (0, 1]");
+
+  const size_t numPoints = referenceSet.n_cols; // Points in original set.
+
+  // Step 1. Select a random sample of the dataset. We will work with only that
+  // sample.
+  arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu);
+
+  // Keep a sample of the dataset. Shuffle to be impartial (in case reference
+  // set is sorted).
+  arma::mat sampleSet = arma::shuffle(referenceSet.cols(
+        // We have uniformly random numbers in [0, 1], so we expect about
+        // N*sampleSize of them to be in [0, sampleSize).
+        arma::find(sampleHelper < sampleSize)
+        ));
+  const size_t numSamples = sampleSet.n_cols; // Points in sampled set.
+
+  Log::Info << "Sampled " << numSamples << " points to train with." << std::endl;
+
+  // Step 2. Compute all-vs-all distances of points in the sample.
+  // The distance matrix is symmetric, so we only compute elements above the
+  // diagonal. There are (N * (N - 1)) / 2 such elements.
+  Timer::Start("pairwise_distances");
+  arma::vec distances(numSamples * (numSamples - 1) / 2);
+  size_t d = 0; // Index of where to store next.
+  for (size_t i = 0; i < numSamples; ++i)
+    for (size_t j = i + 1; j < numSamples; ++j)
+      distances(d++) = metric::EuclideanDistance::Evaluate(
+          sampleSet.unsafe_col(i), sampleSet.unsafe_col(j));
+  Log::Info << "Computed " << d << " pointwise distances." << std::endl;
+  Timer::Stop("pairwise_distances");
+
+  // Step 3. Estimate statistics of these distances: log(mean(d)), mean(log(d)),
+  // mean(d).
+  distances = arma::pow(distances, 2);
+  meanDist = arma::mean(distances);
+  logMeanDist = std::log(meanDist);
+  meanLogDist = arma::mean(arma::log(distances));
+
+  // Step 4. Select a small part of the sample as 'anchor points'. Use the rest
+  // of the sample as the reference set. Find the k-Nearest Neighbors' distances
+  // from the anchor points for increasing portion of the reference set. Compute
+  // the arithmetic and geometric mean of distances from each anchor to its
+  // k-Nearest Neighbor.
+  // The geometric mean of N numbers is the Nth root of the product of the
+  // numbers. Through logarithmic properties though, this becomes computable
+  // through exponentiating the mean of the logarithms of x:
+  // exp(mean(log(x))) = geometricmean(x).
+
+  // Number of samples to create for modeling the Gamma Distributions
+  size_t regressionExamples = 50; // TODO: parameter?
+  // Number of points to use as queries.
+  size_t numAnchors = (size_t) std::round(0.1 * numSamples);
+  arma::mat queryMat = sampleSet.cols(0, numAnchors - 1);
+  // Evenly spaced sample sizes.
+  arma::Col<size_t> referenceSizes = arma::conv_to< arma::Col<size_t> >::from(
+    arma::linspace(numAnchors, numSamples - numAnchors - 1,
+      regressionExamples));
+
+  // Statistics - Arithmetic and geometric means for growing reference set.
+  // Compute one of each for each k.
+  arma::mat Ek(regressionExamples, k);
+  arma::mat Gk(regressionExamples, k);
+
+  Timer::Start("neighbors_distances");
+  // For each referenceSize, calculate the kNN of the anchors
+  for (size_t i = 0; i < regressionExamples; ++i)
+  {
+    // TODO: Since we've already computed this, avoid calling kNN?
+    // Reference set for kNN
+    arma::mat refMat = sampleSet.cols(numAnchors, numAnchors + referenceSizes(i) );
+
+    arma::Mat<size_t> neighbors; // Not going to be used but required.
+    arma::mat kNNDistances; // What we need.
+    KNN naive(refMat, true); // true: train and use naive kNN.
+    naive.Search(queryMat, k, neighbors, kNNDistances);
+    kNNDistances = arma::pow(kNNDistances, 2);
+
+    // Compute Arithmetic and Geometric mean of the distances.
+    Ek.row(i) = arma::mean(kNNDistances.t());
+    Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0));
+  }
+  Timer::Stop("neighbors_distances");
+
+  // Step 5. Model the arithmetic and geometric mean according to the paper.
+  // This will produce 6 parameters (aE, bE, cE, aG, bG, cG) for each value of k
+  // from 1 to the k specified by the user.
+  ApproximateKNNStatistics(referenceSizes, Ek, Gk);
+
+  // Step 6. Fit Gamma distributions to pairwise distances and kNN distances,
+  // generated or estimated in steps 3 and 5.
+
+  // Step 7. Run Binary search on parameter space to minimize selectivity while
+  // keeping recall above minimum.
+}
+
+// Fit two predictors for each k.
+template <typename SortPolicy>
+void LSHModel<SortPolicy>::ApproximateKNNStatistics(
+    arma::Col<size_t> referenceSizes,
+    arma::mat Ek,
+    arma::mat Gk)
+{
+  size_t k = Ek.n_cols;
+
+  // Clear vectors and set them to correct size.
+  aMeanPredictors.clear();
+  gMeanPredictors.clear();
+  aMeanPredictors.resize(k);
+  gMeanPredictors.resize(k);
+
+  // Fit two predictors per value of k.
+  for (size_t i = 0; i < k; ++i)
+  {
+    aMeanPredictors[i] = DistanceStatisticPredictor(
+        referenceSizes, Ek.col(i), i);
+    gMeanPredictors[i] = DistanceStatisticPredictor(
+        referenceSizes, Gk.col(i), i);
+  }
+}
+
+// Construct and return an LSH object.
+template <typename SortPolicy>
+LSHSearch<SortPolicy>* LSHModel<SortPolicy>::LSHObject(const size_t numProjIn,
+                                           const size_t numTablesIn,
+                                           const double hashWidthIn,
+                                           const size_t secondHashSize,
+                                           const size_t bucketSize)
+{
+  // Values for the object to be created with (specified by user or default).
+  size_t numProjOut = numProjIn;
+  size_t numTablesOut = numTablesIn;
+  double hashWidthOut = hashWidthIn;
+
+  // If not specified by user, set these to the ones we trained for.
+  if (numProjIn == 0)
+    numProjOut = this->numProj;
+
+  if (numTablesIn == 0)
+    numTablesOut = this->numTables;
+
+  if (hashWidthOut == 0.0)
+    hashWidthOut = this->hashWidth;
+
+  std::cout << *referenceSet;
+
+  //TODO This causes a bad_alloc... I'm doing something wrong with the
+  //referenceSet.
+  /*
+  // Construct an object and return it.
+  LSHSearch<>* lshObject = new LSHSearch<>(
+      *referenceSet, numProjOut, numTablesOut, hashWidthOut,
+      secondHashSize, bucketSize);
+  return lshObject;
+  */
+
+  LSHSearch<>* lshObject = new LSHSearch<>();
+  return lshObject;
+
+
+}
+
+// Fit a curve to the data provided.
+template<typename SortPolicy>
+void LSHModel<SortPolicy>::DistanceStatisticPredictor::Train(
+    const arma::Col<size_t>& inputSize,
+    const arma::vec& statistic)
+{
+  Log::Warn << "Not implemented yet! " << std::endl;
+
+  alpha = beta = gamma = 1;
+  beta++;
+  alpha+=2;
+}
+
+// Serialize the object and save to a file.
+template <typename SortPolicy>
+template<typename Archive>
+void LSHModel<SortPolicy>::Serialize(Archive& ar)
+{
+  //TODO: implement this.
+}
+} // Namespace neighbor.
+} // Namespace mlpack.
+
+#endif
diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
index e28d2644bbf..d5253a9a7e9 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
@@ -1,7 +1,10 @@
 #include <mlpack/core.hpp>
+#include <mlpack/methods/lsh/lsh_search.hpp>
 
 #include "lshmodel.hpp"
+
 using namespace mlpack;
+using namespace mlpack::neighbor;
 
 PROGRAM_INFO("LSH Model (TODO: Complete this)", "");
 
@@ -10,6 +13,13 @@ PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m");
 
 int main(int argc, char* argv[])
 {
-  std::cout << "Hello!" << std::endl;
+  CLI::ParseCommandLine(argc, argv);
+
+  // Generate a random point set.
+  size_t N = 5000;
+  size_t d = 10;
+  arma::mat A(d, N, arma::fill::randu);
+  LSHModel<> model(A, 0.7, 0.25, 2);
+
   return 0;
 }

From 8d7cb38ba8369e838891215f168a94f3cd7789bd Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Thu, 4 Aug 2016 16:02:22 +0100
Subject: [PATCH 03/18] Implements modeling of arithmetic and geometric mean of
 distances

---
 src/mlpack/methods/lsh_model/lshmodel.hpp     |  67 ++++---
 .../methods/lsh_model/lshmodel_impl.hpp       |  98 +++++-----
 .../methods/lsh_model/objectivefunction.hpp   | 171 ++++++++++++++++++
 3 files changed, 265 insertions(+), 71 deletions(-)
 create mode 100644 src/mlpack/methods/lsh_model/objectivefunction.hpp

diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
index c6ba93d4500..55b4ae03479 100644
--- a/src/mlpack/methods/lsh_model/lshmodel.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -42,13 +42,21 @@
 
 // For returning LSHSearch objects.
 #include <mlpack/methods/lsh/lsh_search.hpp>
+// For template parameters and kNN search (if necessary).
 #include <mlpack/methods/neighbor_search/neighbor_search.hpp>
 #include <mlpack/methods/neighbor_search/sort_policies/nearest_neighbor_sort.hpp>
+// For curve fitting.
+#include <mlpack/core/optimizers/lbfgs/lbfgs.hpp>
+// Default objective function.
+#include "objectivefunction.hpp"
 
 namespace mlpack {
 namespace neighbor {
 
-template <typename SortPolicy = NearestNeighborSort>
+template <
+    typename SortPolicy = NearestNeighborSort,
+    typename ObjectiveFunction = DefaultObjectiveFunction
+      >
 class LSHModel
 {
  public:
@@ -154,7 +162,7 @@ class LSHModel
     * This is a helper class that uses the function a * k^b * N^c for some
     * parameters a, b, c that have been fit to either predict the arithmetic or
     * geometric mean of the squared distance of a point to its k-nearest
-    * neighbor, given some dataset size N.
+    * neighbor, given some dataset size N and neighbor rank k.
     */
    class DistanceStatisticPredictor
    {
@@ -162,12 +170,20 @@ class LSHModel
       //! Empty constructor.
       DistanceStatisticPredictor() { };
 
-      //! Construct with training set.
+      /** 
+       * Function to construct with training set.
+       *
+       * @param inputSize A vector of input sizes. The first input variable of 
+       *     the regression.
+       * @param kValues A vector of k values. The second input variable of the
+       *     regression.
+       * @param statistic A vector of responses - the value of the statistic for
+       *     each given inputSize.
+       */
       DistanceStatisticPredictor(const arma::Col<size_t>& inputSize, 
-                                 const arma::vec& statistic,
-                                 size_t k) 
-        : k(k)
-      { Train(inputSize, statistic); };
+                                 const arma::Col<size_t>& kValues,
+                                 const arma::mat& statistic) 
+      { Train(inputSize, kValues, statistic); };
       
       //! Default destructor.
       ~DistanceStatisticPredictor() { };
@@ -175,12 +191,16 @@ class LSHModel
       /**
        * Function that fits the alpha, beta and gamma parameters.
        *
-       * @param inputSize A vector of input sizes. The input variable of the
+       * @param inputSize A vector of input sizes. The first input variable of 
+       *     the regression.
+       * @param kValues A vector of k values. The second input variable of the
        *     regression.
        * @param statistic A vector of responses - the value of the statistic for
        *     each given inputSize.
        */
-      void Train(const arma::Col<size_t>& inputSize, const arma::vec& statistic);
+      double Train(const arma::Col<size_t>& inputSize, 
+                 const arma::Col<size_t>& kValues,
+                 const arma::mat& statistic);
 
       /** 
        * Evaluate the statistic for a given dataset size.
@@ -188,7 +208,7 @@ class LSHModel
        * @param N - a new input size for which to evaluate the expected
        *     statistic.
        */
-      double Predict(size_t N) 
+      double Predict(size_t N, size_t k) 
       { return alpha * std::pow(k, beta) * std::pow(N, gamma); };
 
       //! Set the alpha parameter.
@@ -209,39 +229,36 @@ class LSHModel
       //! Get the gamma parameter.
       double Gamma(void) { return gamma; };
 
-      //! Set the k parameter.
-      void K(double kIn) { k = kIn; };
-
-      //! Get the k parameter.
-      double K(void) { return k; };
 
     private:
       double alpha;
       double beta;
       double gamma;
-      double k;
    };
 
-   //! Vector of DistanceStatisticPredictors for arithmetic mean.
-   std::vector<DistanceStatisticPredictor> aMeanPredictors;
+   //! DistanceStatisticPredictor for arithmetic mean.
+   DistanceStatisticPredictor aMeanPredictor;
 
-   //! Vector of DistanceStatisticPredictors for geometric mean.
-   std::vector<DistanceStatisticPredictor> gMeanPredictors;
+   //! DistanceStatisticPredictor for geometric mean.
+   DistanceStatisticPredictor gMeanPredictor;
    
    /**
-    * Function that fits two DistanceStatisticPredictors for each k - one
+    * Function that fits two DistanceStatisticPredictors - one
     * to predict arithmetic mean and one to preduct geometric mean.
     *
     * @param referenceSizes The number of reference points for each kNN search.
+    * @param kValues The rank of the neighbors used for the statistic, for
+    *     example k = 5 means Ek is the arithmetic mean of the 5th-nearest
+    *     neighbor for different sample sizes.
     * @param Ek The arithmetic mean of the squared distances of a point and its
     *      k-nearest neighbor. One column per k.
     * @param Gk The geometric mean of the squared distances of a point and its
     *      k-nearest neighbor. One column per k.
     */
-   void ApproximateKNNStatistics(
-       arma::Col<size_t> referenceSizes, 
-       arma::mat Ek, 
-       arma::mat Gk);
+   void ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes, 
+                                 const arma::Col<size_t>& kValues,
+                                 const arma::mat& Ek, 
+                                 const arma::mat& Gk);
 
    //! Flag that tracks if we own the reference set.
    bool ownsSet;
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index 1df2ac0c3ad..3fdc04a73bb 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -19,8 +19,8 @@ namespace mlpack {
 namespace neighbor {
 
 // Constructor sets variables and trains the object.
-template <typename SortPolicy>
-LSHModel<SortPolicy>::LSHModel(const arma::mat &referenceSet,
+template <typename SortPolicy, typename ObjectiveFunction>
+LSHModel<SortPolicy, ObjectiveFunction>::LSHModel(const arma::mat &referenceSet,
                    const double minRecall,
                    const double sampleSize,
                    const size_t k)
@@ -33,16 +33,16 @@ LSHModel<SortPolicy>::LSHModel(const arma::mat &referenceSet,
 }
 
 // Destructor must de-allocate any referenceSet and LSHSearch objects we own.
-template <typename SortPolicy>
-LSHModel<SortPolicy>::~LSHModel()
+template <typename SortPolicy, typename ObjectiveFunction>
+LSHModel<SortPolicy, ObjectiveFunction>::~LSHModel()
 {
   if (ownsSet)
     delete referenceSet;
 };
 
 // Trains the object.
-template <typename SortPolicy>
-void LSHModel<SortPolicy>::Train(const arma::mat &referenceSet,
+template <typename SortPolicy, typename ObjectiveFunction>
+void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSet,
                      const double minRecall,
                      const double sampleSize,
                      const size_t k)
@@ -106,6 +106,7 @@ void LSHModel<SortPolicy>::Train(const arma::mat &referenceSet,
 
   // Number of samples to create for modeling the Gamma Distributions
   size_t regressionExamples = 50; // TODO: parameter?
+
   // Number of points to use as queries.
   size_t numAnchors = (size_t) std::round(0.1 * numSamples);
   arma::mat queryMat = sampleSet.cols(0, numAnchors - 1);
@@ -140,9 +141,10 @@ void LSHModel<SortPolicy>::Train(const arma::mat &referenceSet,
   Timer::Stop("neighbors_distances");
 
   // Step 5. Model the arithmetic and geometric mean according to the paper.
-  // This will produce 6 parameters (aE, bE, cE, aG, bG, cG) for each value of k
-  // from 1 to the k specified by the user.
-  ApproximateKNNStatistics(referenceSizes, Ek, Gk);
+  // This will produce 6 parameters (aE, bE, cE, aG, bG, cG).
+  // Vector of k values.
+  arma::Col<size_t> kValues = arma::linspace<arma::Col<size_t>>(1, k, k);
+  ApproximateKNNStatistics(referenceSizes, kValues, Ek, Gk);
 
   // Step 6. Fit Gamma distributions to pairwise distances and kNN distances,
   // generated or estimated in steps 3 and 5.
@@ -151,38 +153,30 @@ void LSHModel<SortPolicy>::Train(const arma::mat &referenceSet,
   // keeping recall above minimum.
 }
 
-// Fit two predictors for each k.
-template <typename SortPolicy>
-void LSHModel<SortPolicy>::ApproximateKNNStatistics(
-    arma::Col<size_t> referenceSizes,
-    arma::mat Ek,
-    arma::mat Gk)
+// Fit two predictors, one for arithmetic mean E and one for geometric mean G.
+template <typename SortPolicy, typename ObjectiveFunction>
+void LSHModel<SortPolicy, ObjectiveFunction>::
+ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes,
+                         const arma::Col<size_t>& kValues,
+                         const arma::mat& Ek,
+                         const arma::mat& Gk)
 {
-  size_t k = Ek.n_cols;
-
-  // Clear vectors and set them to correct size.
-  aMeanPredictors.clear();
-  gMeanPredictors.clear();
-  aMeanPredictors.resize(k);
-  gMeanPredictors.resize(k);
-
-  // Fit two predictors per value of k.
-  for (size_t i = 0; i < k; ++i)
-  {
-    aMeanPredictors[i] = DistanceStatisticPredictor(
-        referenceSizes, Ek.col(i), i);
-    gMeanPredictors[i] = DistanceStatisticPredictor(
-        referenceSizes, Gk.col(i), i);
-  }
+  double aError = aMeanPredictor.Train(referenceSizes, kValues, Ek);
+  Log::Info << "L_BFGS Converged for arithmetic mean with error "
+    << aError << "." << std::endl;
+  double gError = gMeanPredictor.Train(referenceSizes, kValues, Gk);
+  Log::Info << "L_BFGS Converged for geometric mean with error "
+    << gError << "." << std::endl;
 }
 
 // Construct and return an LSH object.
-template <typename SortPolicy>
-LSHSearch<SortPolicy>* LSHModel<SortPolicy>::LSHObject(const size_t numProjIn,
-                                           const size_t numTablesIn,
-                                           const double hashWidthIn,
-                                           const size_t secondHashSize,
-                                           const size_t bucketSize)
+template <typename SortPolicy, typename ObjectiveFunction>
+LSHSearch<SortPolicy>* LSHModel<SortPolicy, ObjectiveFunction>::
+LSHObject(const size_t numProjIn,
+          const size_t numTablesIn,
+          const double hashWidthIn,
+          const size_t secondHashSize,
+          const size_t bucketSize)
 {
   // Values for the object to be created with (specified by user or default).
   size_t numProjOut = numProjIn;
@@ -218,22 +212,34 @@ LSHSearch<SortPolicy>* LSHModel<SortPolicy>::LSHObject(const size_t numProjIn,
 }
 
 // Fit a curve to the data provided.
-template<typename SortPolicy>
-void LSHModel<SortPolicy>::DistanceStatisticPredictor::Train(
+template<typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::DistanceStatisticPredictor::Train(
     const arma::Col<size_t>& inputSize,
-    const arma::vec& statistic)
+    const arma::Col<size_t>& kValues,
+    const arma::mat& statistic)
 {
-  Log::Warn << "Not implemented yet! " << std::endl;
+  // Objective function for fitting the E(x, k) curve to the statistic.
+  ObjectiveFunction f(inputSize, kValues, statistic);
+
+  // Optimizer. Use L_BFGS (TODO: Make this a template parameter?)
+  mlpack::optimization::L_BFGS<ObjectiveFunction> opt(f);
+
+  // Get an initial point from the objective function.
+  arma::mat currentPoint = f.GetInitialPoint();
+  double result = opt.Optimize(currentPoint);
+
+  // Optimizer is done - set alpha, beta, gamma.
+  this->alpha = currentPoint(0, 0);
+  this->beta = currentPoint(1, 0);
+  this->gamma = currentPoint(2, 0);
 
-  alpha = beta = gamma = 1;
-  beta++;
-  alpha+=2;
+  return result;
 }
 
 // Serialize the object and save to a file.
-template <typename SortPolicy>
+template <typename SortPolicy, typename ObjectiveFunction>
 template<typename Archive>
-void LSHModel<SortPolicy>::Serialize(Archive& ar)
+void LSHModel<SortPolicy, ObjectiveFunction>::Serialize(Archive& ar)
 {
   //TODO: implement this.
 }
diff --git a/src/mlpack/methods/lsh_model/objectivefunction.hpp b/src/mlpack/methods/lsh_model/objectivefunction.hpp
new file mode 100644
index 00000000000..2aa60ab8054
--- /dev/null
+++ b/src/mlpack/methods/lsh_model/objectivefunction.hpp
@@ -0,0 +1,171 @@
+/**
+ * @file objectivefunction.hpp
+ * @author Yannis Mentekidis
+ *
+ * This file implements a class that describes an objective function for
+ * minimization. It is used by the LSH model to fit a curve of the form
+ * E(k, N) = \alpha \cdot k ^ \beta \cdot N^\gamma
+ * to a certain statistic E, which can be either the arithmetic or the geometric
+ * mean of distances of a random point and its k-Nearest Neighbors.
+ *
+ * The objective function to minimize is the mean squared error (MSE):
+ *
+ * Error = \frac{1}{M} \sum_{i=1}^{M} (y_i - \alpha k_i^\beta N_i^\gamma)^2
+ *
+ * The class is designed for use with the L_BFGS optimizer, which is what the
+ * lshmodel class uses.
+ */
+
+#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_DEFAULT_OBJECTIVE_FUNCTION_HPP
+#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_DEFAULT_OBJECTIVE_FUNCTION_HPP
+
+namespace mlpack {
+namespace neighbor {
+
+class DefaultObjectiveFunction
+{
+  public:
+    //! Default constructor - do nothing.
+    DefaultObjectiveFunction() { };
+
+    /**
+     * Parameterized constructor.
+     *
+     * @param xData Vector of x - the sizes of the reference set when performing
+     *    kNN.
+     * @param kData Vector of k - the kth nearest neighbor for which we
+     *    calculated the statistic.
+     * @param yData Matrix of y, one for each (x, k) value.
+     */
+    DefaultObjectiveFunction(const arma::Col<size_t>& xData, 
+                             const arma::Col<size_t>& kData, 
+                             const arma::mat& yData)
+      : xData(&xData), kData(&kData), yData(&yData)  { };
+
+    //! Return the number of functions
+    size_t NumFunctions(void) const { return yData->n_elem; }
+
+    //! Return a random starting point.
+    arma::mat GetInitialPoint() const 
+    { return arma::mat(3, 1, arma::fill::randu); }
+
+    /**
+     * This function evaluates the objective (MSE) over all data points at the
+     * given coordinates (alpha, beta, gamma).
+     * Called by the optimizer.
+     *
+     * @param coordinates Input matrix of coordinates. 
+     */
+    double Evaluate(const arma::mat& coordinates) const;
+
+    /**
+     * This function evaluates the objective's gradient at some coordinates.
+     * Called by the optimizer.
+     *
+     * @param coordinates Input matrix of coordinates.
+     * @param gradient Output matrix of gradients for each dimension of the
+     *    surface
+     */
+    void Gradient(const arma::mat& coordinates, 
+                  arma::mat& gradient) const;
+
+  private:
+    //! Data points for x-axis.
+    const arma::Col<size_t>* xData;
+    //! Data points for k-axis.
+    const arma::Col<size_t>* kData;
+    //! Data points for y-axis.
+    const arma::mat* yData;
+};
+
+/**
+ * Returns the value of the objective function for some coordinates (alpha,
+ * beta, gamma).
+ * This is the mean squared error for the current parameters or coordinates.
+ */
+double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const
+{
+  // Use extra variables to make code readable.
+  double alpha = coordinates(0, 0);
+  double beta = coordinates(1, 0);
+  double gamma = coordinates(2, 0);
+  double M = (double) NumFunctions();
+
+  // Sum the squared error for each element in yData.
+  double sum = 0;
+  for (size_t i = 0; i < yData->n_elem; ++i)
+  {
+    // Map i to (row, col). Columnwise access of yData.
+    size_t row = i % yData->n_rows;
+    size_t col = (size_t) (i / yData->n_rows); // Integer division (floor).
+
+    // Get the corresponding values.
+    size_t x = (*xData)(row);
+    size_t k = (*kData)(col);
+    double y = (*yData)(row, col);
+
+    // Evaluate (y - a * k ^ b * x ^ c)^2 for the given (x, y) pair.
+    sum += pow(y - alpha * std::pow(k, beta) * std::pow(x, gamma), 2); 
+  }
+
+  // Return the mean of the squared errors.
+  return sum / M;
+}
+
+/**
+ * Stores the gradient of the objective function in gradient. This is the
+ * derivative with respect to (alpha, beta, gamma) evaluated at the current
+ * parameters.
+ */
+void DefaultObjectiveFunction::Gradient(const arma::mat& coordinates,
+                                        arma::mat& gradient) const
+{
+  // Use extra variables to make code readable.
+  double alpha = coordinates(0, 0);
+  double beta = coordinates(1, 0);
+  double gamma = coordinates(2, 0);
+  double M = (double) NumFunctions();
+
+  // Allocate 3x1 matrix for gradient. Set all gradients to 0.
+  gradient.set_size(3, 1);
+  gradient.zeros(3,1);
+
+  // Sum each gradient.
+  for (size_t i = 0; i < yData->n_elem; ++i)
+  {
+    size_t row = i % yData->n_rows;
+    size_t col = (size_t) (i / yData->n_rows); // Integer division.
+    size_t x = (*xData)(row);
+    size_t k = (*kData)(col);
+    double y = (*yData)(row, col);
+
+    // The error for these parameters. Precompute for efficiency.
+    double error = (y - alpha * std::pow(k, beta) * std::pow(x, gamma));
+
+    // The chain rule factor of the product, for each gradient dimension.
+    double alphaChain = 
+      - 2.0 * std::pow(k, beta) * std::pow(x, gamma);
+
+    double betaChain = 
+      - 2.0 * alpha * std::pow(x, gamma) * std::log(k) * std::pow(k, beta);
+
+    double gammaChain =
+      - 2.0 * alpha * std::pow(k, beta) * std::log(x) * std::pow(x, gamma);
+
+    // 3x1 column vector (in matrix form).
+    gradient(0, 0) += error * alphaChain;
+    gradient(1, 0) += error * betaChain;
+    gradient(2, 0) += error * gammaChain;
+  }
+
+  // Return the average of each gradient after the summation is complete.
+  gradient(0, 0) /= ((double) M);
+  gradient(1, 0) /= ((double) M);
+  gradient(2, 0) /= ((double) M);
+}
+
+} // namespace neighbor
+} // namespace mlpack
+
+#endif
+

From 29b6eaadfca44102f23edd889012f0ef57dbd762 Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Thu, 4 Aug 2016 17:22:32 +0100
Subject: [PATCH 04/18] Adds code that returns trained LSH objects

---
 src/mlpack/methods/lsh_model/lshmodel.hpp      |  6 +++---
 src/mlpack/methods/lsh_model/lshmodel_impl.hpp | 18 ++++--------------
 src/mlpack/methods/lsh_model/lshmodel_main.cpp | 10 ++++++++--
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
index 55b4ae03479..e0d8299317c 100644
--- a/src/mlpack/methods/lsh_model/lshmodel.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -130,7 +130,7 @@ class LSHModel
     * @param secondHashSize The second level hash width.
     * @param bucketSize The second level bucket size.
     */
-   LSHSearch<SortPolicy>* LSHObject(
+   LSHSearch<SortPolicy> LSHObject(
        const size_t numProjIn = 0,
        const size_t numTablesIn = 0,
        const double hashWidthIn = 0.0,
@@ -281,8 +281,8 @@ class LSHModel
    //! Reference dataset.
    const arma::mat* referenceSet;
 
-   //! Vector of LSHSearch objects.
-   std::vector< LSHSearch<SortPolicy> > lshObjectVector;
+   //! LSHSearch Object Vector.
+   std::vector<LSHSearch<SortPolicy>> lshObjectVector;
 
    //! Statistic: average squared distance of points.
    double meanDist;
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index 3fdc04a73bb..3d21843cabb 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -171,7 +171,7 @@ ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes,
 
 // Construct and return an LSH object.
 template <typename SortPolicy, typename ObjectiveFunction>
-LSHSearch<SortPolicy>* LSHModel<SortPolicy, ObjectiveFunction>::
+LSHSearch<SortPolicy> LSHModel<SortPolicy, ObjectiveFunction>::
 LSHObject(const size_t numProjIn,
           const size_t numTablesIn,
           const double hashWidthIn,
@@ -193,22 +193,12 @@ LSHObject(const size_t numProjIn,
   if (hashWidthOut == 0.0)
     hashWidthOut = this->hashWidth;
 
-  std::cout << *referenceSet;
-
-  //TODO This causes a bad_alloc... I'm doing something wrong with the
-  //referenceSet.
-  /*
-  // Construct an object and return it.
-  LSHSearch<>* lshObject = new LSHSearch<>(
-      *referenceSet, numProjOut, numTablesOut, hashWidthOut,
+  LSHSearch<> lsh(*referenceSet, numProjOut, numTablesOut, hashWidthOut,
       secondHashSize, bucketSize);
-  return lshObject;
-  */
-
-  LSHSearch<>* lshObject = new LSHSearch<>();
-  return lshObject;
 
+  lshObjectVector.push_back(lsh);
 
+  return lshObjectVector[lshObjectVector.size() - 1];
 }
 
 // Fit a curve to the data provided.
diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
index d5253a9a7e9..7192ce8ff96 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
@@ -18,8 +18,14 @@ int main(int argc, char* argv[])
   // Generate a random point set.
   size_t N = 5000;
   size_t d = 10;
-  arma::mat A(d, N, arma::fill::randu);
-  LSHModel<> model(A, 0.7, 0.25, 2);
+  arma::mat rdata(d, N, arma::fill::randu);
+  LSHModel<> model(rdata, 0.7, 0.25, 2);
+
+  arma::mat qdata(d, 1, arma::fill::randu);
+  arma::Mat<size_t> neighbors;
+  arma::mat distances;
+  LSHSearch<> lsh = model.LSHObject(1, 1, 1.0, 99901, 500);
+  lsh.Search(qdata, 1, neighbors, distances);
 
   return 0;
 }

From dd8f5f37263ac83414128599ded4a2939439317c Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Fri, 5 Aug 2016 11:14:12 +0100
Subject: [PATCH 05/18] Attempt to fix arma::shuffle call that causes Travis
 build failure.

---
 src/mlpack/methods/lsh_model/lshmodel_impl.hpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index 3d21843cabb..b400d1cc228 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -63,13 +63,12 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSe
   // sample.
   arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu);
 
-  // Keep a sample of the dataset. Shuffle to be impartial (in case reference
-  // set is sorted).
-  arma::mat sampleSet = arma::shuffle(referenceSet.cols(
-        // We have uniformly random numbers in [0, 1], so we expect about
-        // N*sampleSize of them to be in [0, sampleSize).
-        arma::find(sampleHelper < sampleSize)
-        ));
+  // Keep a sample of the dataset: We have uniformly random numbers in [0, 1],
+  // so we expect about N*sampleSize of them to be in [0, sampleSize).
+  arma::mat sampleSet = referenceSet.cols(
+        arma::find(sampleHelper < sampleSize));
+  // Shuffle to be impartial (in case dataset is sorted in some way).
+  sampleSet = arma::shuffle(sampleSet);
   const size_t numSamples = sampleSet.n_cols; // Points in sampled set.
 
   Log::Info << "Sampled " << numSamples << " points to train with." << std::endl;

From 009a4acf3ee417dbae2eb2de4ea91a3caa696818 Mon Sep 17 00:00:00 2001
From: Yannis Mentekidis <mentekid@gmail.com>
Date: Fri, 5 Aug 2016 15:22:07 +0100
Subject: [PATCH 06/18] Completes Train(), adds Predict()

---
 src/mlpack/methods/lsh_model/lshmodel.hpp     | 101 +++++++++--------
 .../methods/lsh_model/lshmodel_impl.hpp       | 103 +++++++++++++-----
 .../methods/lsh_model/lshmodel_main.cpp       |   6 +-
 3 files changed, 137 insertions(+), 73 deletions(-)

diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
index e0d8299317c..2890b0a91ce 100644
--- a/src/mlpack/methods/lsh_model/lshmodel.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -49,6 +49,8 @@
 #include <mlpack/core/optimizers/lbfgs/lbfgs.hpp>
 // Default objective function.
 #include "objectivefunction.hpp"
+// Gamma distribution for modeling squared distances.
+#include <mlpack/core/dists/gamma_distribution.hpp>
 
 namespace mlpack {
 namespace neighbor {
@@ -62,7 +64,7 @@ class LSHModel
  public:
 
    //! Empty Constructor. Do nothing
-   LSHModel(){ /* Do nothing. */ };
+   LSHModel(){ referenceSet = NULL; };
 
    /** Parameterized Constructor. This function initializes the object and
     * trains it with the provided reference set.
@@ -70,9 +72,6 @@ class LSHModel
     * @param referenceSet The data that will be used as a reference set for LSH
     *     to run queries against. We will fit distributions based on this data
     *     and produce good parameters for it.
-    * @param minRecall The minimum recall we want to guarantee. The parameters
-    *     we will estimate will try to keep average recall of LSH above this.
-    *     Must be in [0, 1).
     * @param sampleSize The percentage of the reference set to sample for the
     *     estimation. Naive all-kNN will be run on this sample, so if it is too
     *     big, training will be very slow. Must be in [0, 1)
@@ -80,7 +79,6 @@ class LSHModel
     */
    LSHModel(
        const arma::mat &referenceSet,
-       const double minRecall,
        const double sampleSize,
        const size_t k);
 
@@ -88,34 +86,44 @@ class LSHModel
    ~LSHModel();
 
    /**
-    * Trains the LSHModel. Fits distributions using referenceSet and then looks
-    * for LSH parameters that would return recalls larger than minRecall in the
-    * lowest cost (selectivity) possible.
+    * Trains the LSHModel. Train() uses a sample that is sampleRate * |N| to
+    * estimate parameters of the dataset. The estimated parameters are:
+    *   * Arithmetic mean of pairwise distances of random points in the sample.
+    *   * Geometric mean of the pairwise distances.
+    *   * Arithmetic mean of distance from random point to k-th nearest neighbor
+    *       as a function of |N|, the number of points.
+    *   * Geometric mean of the same distance.
     *
-    * The model can estimate good values for the parameters:
-    *   * numProj: Number of projections per projection table.
-    *   * numTables: Number of projection tables.
-    *   * hashWidth: Hash width of the LSH hash.
-    *   * numProbes: Number of probes for multiprobe LSH.
-    *
-    * Train stores the computed parameters in the LSHModel object's variables.
+    * Train() does not find LSH Parameters - it only estimates the dataset
+    * parameters. You have to call Predict() to find LSH Parameters.
     *
     * @param referenceSet The data that will be used as a reference set for LSH
     *     to run queries against. We will fit distributions based on this data
     *     and produce good parameters for it.
-    * @param minRecall The minimum recall we want to guarantee. The parameters
-    *     we will estimate will try to keep average recall of LSH above this.
-    *     Must be in [0, 1).
-    * @param sampleSize The percentage of the reference set to sample for the
+    * @param sampleRate The percentage of the reference set to sample for the
     *     estimation. Naive all-kNN will be run on this sample, so if it is too
     *     big, training will be very slow. Must be in [0, 1)
-    * @param k The number of nearest neighbors wanted for each query.
+    * @param maxKValue The maximum number of nearest neighbors for each query to
+    *     train for.
     */
-   void Train(
-       const arma::mat &referenceSet,
-       const double minRecall,
-       const double sampleSize,
-       const size_t k);
+   void Train(const arma::mat& referenceSet, 
+              const double sampleRate = 0.1,
+              const size_t maxKValue = 32);
+
+   /**
+    * Predict() finds LSH parameters that should work well for the dataset the 
+    * LSHModel was trained for. 
+    * Warning: If the k specified is larger than the maxKValue passed to
+    * Train(), Train() will be called again. This might have adverse effects to
+    * performance.
+    *
+    * @param datasetSize The size of the dataset that will be used.
+    * @param k The number of k-nearest neighbors LSH must find.
+    * @param minRecall The minimum acceptable recall we want to tune for.
+    */
+   void Predict(const size_t datasetSize, 
+                const size_t k, 
+                const double minRecall);
 
    /**
     * This function returns an LSHSearch object trained with the parameters
@@ -157,6 +165,23 @@ class LSHModel
    void Serialize(Archive& ar);
 
  private:
+   /**
+    * Function that fits two DistanceStatisticPredictors - one
+    * to predict arithmetic mean and one to predict geometric mean.
+    *
+    * @param referenceSizes The number of reference points for each kNN search.
+    * @param kValues The rank of the neighbors used for the statistic, for
+    *     example k = 5 means Ek is the arithmetic mean of the 5th-nearest
+    *     neighbor for different sample sizes.
+    * @param Ek The arithmetic mean of the squared distances of a point and its
+    *      k-nearest neighbor. One column per k.
+    * @param Gk The geometric mean of the squared distances of a point and its
+    *      k-nearest neighbor. One column per k.
+    */
+   void ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes, 
+                                 const arma::Col<size_t>& kValues,
+                                 const arma::mat& Ek, 
+                                 const arma::mat& Gk);
 
    /**
     * This is a helper class that uses the function a * k^b * N^c for some
@@ -229,7 +254,6 @@ class LSHModel
       //! Get the gamma parameter.
       double Gamma(void) { return gamma; };
 
-
     private:
       double alpha;
       double beta;
@@ -241,30 +265,15 @@ class LSHModel
 
    //! DistanceStatisticPredictor for geometric mean.
    DistanceStatisticPredictor gMeanPredictor;
-   
-   /**
-    * Function that fits two DistanceStatisticPredictors - one
-    * to predict arithmetic mean and one to preduct geometric mean.
-    *
-    * @param referenceSizes The number of reference points for each kNN search.
-    * @param kValues The rank of the neighbors used for the statistic, for
-    *     example k = 5 means Ek is the arithmetic mean of the 5th-nearest
-    *     neighbor for different sample sizes.
-    * @param Ek The arithmetic mean of the squared distances of a point and its
-    *      k-nearest neighbor. One column per k.
-    * @param Gk The geometric mean of the squared distances of a point and its
-    *      k-nearest neighbor. One column per k.
-    */
-   void ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes, 
-                                 const arma::Col<size_t>& kValues,
-                                 const arma::mat& Ek, 
-                                 const arma::mat& Gk);
+
+   //! (k+1)-dimensional gamma distribution for predicting squared distances.
+   mlpack::distribution::GammaDistribution distancesDistribution;
 
    //! Flag that tracks if we own the reference set.
    bool ownsSet;
 
-   //! Flag that tracks if we own an LSHSearch object.
-   bool ownsLSHObject;
+   //! Maximum k value the object is trained for.
+   size_t maxKValue;
 
    //! Number of projections per table.
    size_t numProj;
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index b400d1cc228..50bf1869e12 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -20,16 +20,16 @@ namespace neighbor {
 
 // Constructor sets variables and trains the object.
 template <typename SortPolicy, typename ObjectiveFunction>
-LSHModel<SortPolicy, ObjectiveFunction>::LSHModel(const arma::mat &referenceSet,
-                   const double minRecall,
-                   const double sampleSize,
-                   const size_t k)
+LSHModel<SortPolicy, ObjectiveFunction>::
+LSHModel(const arma::mat &referenceSet,
+         const double sampleSize,
+         const size_t k)
 {
   // We don't own the set - we just point to it.
   ownsSet = false;
   this->referenceSet = &referenceSet;
 
-  Train(referenceSet, minRecall, sampleSize, k);
+  Train(referenceSet, sampleSize, k);
 }
 
 // Destructor must de-allocate any referenceSet and LSHSearch objects we own.
@@ -42,36 +42,37 @@ LSHModel<SortPolicy, ObjectiveFunction>::~LSHModel()
 
 // Trains the object.
 template <typename SortPolicy, typename ObjectiveFunction>
-void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSet,
-                     const double minRecall,
-                     const double sampleSize,
-                     const size_t k)
+void LSHModel<SortPolicy, ObjectiveFunction>::Train(
+    const arma::mat &referenceSet,
+    const double sampleRate,
+    const size_t k)
 {
-  // TODO: Implement
-
-  // Sanity Check: Verify that recall and sampleSize are in [0, 1).
-  if (minRecall >= 1 || minRecall < 0)
-    throw std::runtime_error("Recall must be floating point number in [0, 1)");
-
-  if (sampleSize > 1 || sampleSize <= 0)
+  // Sanity check - sample rate must be in (0, 1].
+  if (sampleRate > 1 || sampleRate <= 0)
     throw std::runtime_error(
         "Sampling rate must be floating point number in (0, 1]");
 
-  const size_t numPoints = referenceSet.n_cols; // Points in original set.
+  // Update the object's max K value information.
+  maxKValue = k;
+
+  // Save pointer to training set.
+  this->referenceSet = &referenceSet;
 
   // Step 1. Select a random sample of the dataset. We will work with only that
   // sample.
+
   arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu);
 
   // Keep a sample of the dataset: We have uniformly random numbers in [0, 1],
-  // so we expect about N*sampleSize of them to be in [0, sampleSize).
+  // so we expect about N*sampleRate of them to be in [0, sampleRate).
   arma::mat sampleSet = referenceSet.cols(
-        arma::find(sampleHelper < sampleSize));
+        arma::find(sampleHelper < sampleRate));
   // Shuffle to be impartial (in case dataset is sorted in some way).
   sampleSet = arma::shuffle(sampleSet);
   const size_t numSamples = sampleSet.n_cols; // Points in sampled set.
 
-  Log::Info << "Sampled " << numSamples << " points to train with." << std::endl;
+  Log::Info << "Training model with " << numSamples << " points in sample set."
+    << std::endl;
 
   // Step 2. Compute all-vs-all distances of points in the sample.
   // The distance matrix is symmetric, so we only compute elements above the
@@ -89,9 +90,9 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSe
   // Step 3. Estimate statistics of these distances: log(mean(d)), mean(log(d)),
   // mean(d).
   distances = arma::pow(distances, 2);
-  meanDist = arma::mean(distances);
-  logMeanDist = std::log(meanDist);
-  meanLogDist = arma::mean(arma::log(distances));
+  this->meanDist = arma::mean(distances);
+  this->logMeanDist = std::log(meanDist);
+  this->meanLogDist = arma::mean(arma::log(distances));
 
   // Step 4. Select a small part of the sample as 'anchor points'. Use the rest
   // of the sample as the reference set. Find the k-Nearest Neighbors' distances
@@ -101,7 +102,7 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSe
   // The geometric mean of N numbers is the Nth root of the product of the
   // numbers. Through logarithmic properties though, this becomes computable
   // through exponentiating the mean of the logarithms of x:
-  // mean(log(x)) = geometricmean(x).
+  // exp(mean(log(x))) = geometricmean(x).
 
   // Number of samples to create for modeling the Gamma Distributions
   size_t regressionExamples = 50; // TODO: parameter?
@@ -119,8 +120,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSe
   arma::mat Ek(regressionExamples, k);
   arma::mat Gk(regressionExamples, k);
 
-  Timer::Start("neighbors_distances");
   // For each referenceSize, calculate the kNN of the anchors
+  Log::Info.ignoreInput = true; // Ignore kNN output.
   for (size_t i = 0; i < regressionExamples; ++i)
   {
     // TODO: Since we've already computed this, avoid calling kNN?
@@ -137,19 +138,69 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(const arma::mat &referenceSe
     Ek.row(i) = arma::mean(kNNDistances.t());
     Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0));
   }
-  Timer::Stop("neighbors_distances");
+  Log::Info.ignoreInput = false; // Keep giving normal output.
 
   // Step 5. Model the arithmetic and geometric mean according to the paper.
   // This will produce 6 parameters (aE, bE, cE, aG, bG, cG).
   // Vector of k values.
+  Timer::Start("neighbor_statistic_regression");
   arma::Col<size_t> kValues = arma::linspace<arma::Col<size_t>>(1, k, k);
   ApproximateKNNStatistics(referenceSizes, kValues, Ek, Gk);
+  Timer::Stop("neighbor_statistic_regression");
+}
+
+// Predict parameters for LSH that will have acceptable recall.
+template <typename SortPolicy, typename ObjectiveFunction>
+void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
+                                                      const size_t k,
+                                                      const double minRecall)
+{
+  // Sanity check. Recall can't be greater/equal to 1, or negative.
+  if (minRecall < 0 || minRecall >=1)
+    throw std::runtime_error("minRecall must be in [0, 1)");
+
+  // If the object wasn't trained, die here.
+  if (referenceSet == NULL)
+    Log::Fatal << "Attempt to use Predict() on untrained Object. Exiting."
+        << std::endl;
+
+  // Before proceeding, if requested K is larger than the k we trained with,
+  // re-train the object.
+  if (k > maxKValue)
+  {
+
+    // Warn the user about the re-training, then re-train.
+    Log::Warn << "Larger k requested; Re-training the LSHModel "
+      "with default sampling rate and new k." << std::endl;
+    Train(*referenceSet, 0.1, k); // Default sampling rate.
+  }
+  // Steps 1 - 5 happen in Train().
 
   // Step 6. Fit Gamma distributions to pairwise distances and kNN distances,
   // generated or estimated in steps 3 and 5.
+  // Gamma distribution for pairwise distances.
+  arma::vec logMeanVec(k + 1), meanLogVec(k + 1), meanVec(k + 1);
+  // Statistics were computed in Train()
+  meanVec(0) = this->meanDist;
+  logMeanVec(0) = this->logMeanDist;
+  meanLogVec(0) = this->meanLogDist;
+  // Train gamma and put in gammaDists[0].
+
+  Timer::Start("fitting_distributions");
+  for (size_t i = 1; i <= k; ++i)
+  {
+    meanVec(i) = aMeanPredictor.Predict(datasetSize, k);
+    logMeanVec(i) = std::log(meanVec(i));
+    // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog
+    meanLogVec(i) = std::log(gMeanPredictor.Predict(datasetSize, k));
+  }
+  // Fit the distribution.
+  distancesDistribution.Train(logMeanVec, meanLogVec, meanVec);
+  Timer::Stop("fitting_distributions");
 
   // Step 7. Run Binary search on parameter space to minimize selectivity while
   // keeping recall above minimum.
+
 }
 
 // Fit two predictors, one for arithmetic mean E and one for geometric mean G.
diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
index 7192ce8ff96..00cd2d03e57 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_main.cpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_main.cpp
@@ -18,8 +18,12 @@ int main(int argc, char* argv[])
   // Generate a random point set.
   size_t N = 5000;
   size_t d = 10;
+  size_t k = 5;
+  double sampleSize = 0.25;
+  double minRecall = 0.4;
   arma::mat rdata(d, N, arma::fill::randu);
-  LSHModel<> model(rdata, 0.7, 0.25, 2);
+  LSHModel<> model(rdata, sampleSize, k);
+  model.Predict(N, k, minRecall);
 
   arma::mat qdata(d, 1, arma::fill::randu);
   arma::Mat<size_t> neighbors;

From cdcb575826bfb3bd0ef4cafacf465435b3d6d144 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Fri, 12 Aug 2016 14:09:12 +0100
Subject: [PATCH 07/18] Adds Perturbation Sequence Generation (needs bugfixing)

---
 src/mlpack/methods/lsh_model/lshmodel.hpp     |  59 +++++
 .../methods/lsh_model/lshmodel_impl.hpp       | 228 +++++++++++++++++-
 2 files changed, 281 insertions(+), 6 deletions(-)

diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh_model/lshmodel.hpp
index 2890b0a91ce..ca146f4d503 100644
--- a/src/mlpack/methods/lsh_model/lshmodel.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel.hpp
@@ -165,6 +165,65 @@ class LSHModel
    void Serialize(Archive& ar);
 
  private:
+
+  /**
+   * Returns the score of the perturbation vector described by perturbation set
+   * A. The score of a perturbation set is the sum of the scores of the actions
+   * that are marked (true) in A.
+   * @param A Perturbation set to compute the score of; A[i] marks action i.
+   * @param scores Vector holding the score of each action, indexed like A.
+   */
+  double PerturbationScore(const std::vector<bool>& A,
+                                  const arma::vec& scores) const;
+  /**
+   * Inline helper used by GenerateTemplateSequence. The shift operation moves
+   * the largest marked index of A to (largest index) + 1. Returns false,
+   * leaving A untouched, if that index would fall outside A; otherwise true.
+   * @param A Perturbation set to shift; modified in place.
+   */
+  bool PerturbationShift(std::vector<bool>& A) const;
+
+  /**
+   * Inline helper used by GenerateTemplateSequence. The expansion operation
+   * additionally marks index (largest index) + 1 in A, where the largest index
+   * is the last marked position of A. Returns false, leaving A untouched, if
+   * that index would fall outside A; otherwise true.
+   * @param A Perturbation set to expand; modified in place.
+   */
+  bool PerturbationExpand(std::vector<bool>& A) const;
+
+  /**
+   * Return true if perturbation set A is valid. A perturbation set is invalid if
+   * it contains two (or more) actions for the same dimension, or if it has more
+   * than 2 * numProj entries.
+   * @param A Perturbation set to validate.
+   * @param numProj The number of projections for the sequence under validation.
+  */
+  bool PerturbationValid(const std::vector<bool>& A, size_t numProj) const;
+   /**
+    * Function that creates a template perturbation sequence given a value for
+    * an M and a W. The template perturbation sequence is based on the
+    * statistical properties of multi-probe LSH and uses those, instead of
+    * specific points, to generate scores.
+    * See mlpack/methods/lsh/lsh_search_impl.hpp for more details about how
+    * perturbation sequences are generated from specific points.
+    *
+    * @param numProj The number of projections for the LSH scheme for which we
+    *     want to compute the template perturbation sequence.
+    * @param hashWidth The hash width for the LSH scheme.
+    * @param numProbes The number of probes to generate.
+    */
+   void GenerateTemplateSequence(size_t numProj, 
+                                 double hashWidth, 
+                                 size_t numProbes);
+
+   /** Matrix that stores, in each column, the "direction" of the perturbation:
+    * 0 means no perturbation on that dimension, -1 means reduce dimension value
+    * by 1, and +1 means increase dimension value by 1.
+    */
+   
+   arma::Mat<short int> templateSequence;
+
    /**
     * Function that fits two DistanceStatisticPredictors - one
     * to predict arithmetic mean and one to preduct geometric mean.
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
index 50bf1869e12..b2de464eb6c 100644
--- a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh_model/lshmodel_impl.hpp
@@ -49,8 +49,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
 {
   // Sanity check - sample rate must be in (0, 1].
   if (sampleRate > 1 || sampleRate <= 0)
-    throw std::runtime_error(
-        "Sampling rate must be floating point number in (0, 1]");
+    Log::Fatal << "Sampling rate must be floating point number in (0, 1]"
+        << std::endl;
 
   // Update the object's max K value information.
   maxKValue = k;
@@ -60,7 +60,6 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
 
   // Step 1. Select a random sample of the dataset. We will work with only that
   // sample.
-
   arma::vec sampleHelper(referenceSet.n_cols, arma::fill::randu);
 
   // Keep a sample of the dataset: We have uniformly random numbers in [0, 1],
@@ -125,8 +124,10 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
   for (size_t i = 0; i < regressionExamples; ++i)
   {
     // TODO: Since we've already computed this, avoid calling kNN?
+
     // Reference set for kNN
-    arma::mat refMat = sampleSet.cols(numAnchors, numAnchors + referenceSizes(i) );
+    arma::mat refMat = sampleSet.cols(numAnchors,
+        numAnchors + referenceSizes(i));
 
     arma::Mat<size_t> neighbors; // Not going to be used but required.
     arma::mat kNNDistances; // What we need.
@@ -149,6 +150,7 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
   Timer::Stop("neighbor_statistic_regression");
 }
 
+
 // Predict parameters for LSH that will have acceptable recall.
 template <typename SortPolicy, typename ObjectiveFunction>
 void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
@@ -157,7 +159,7 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
 {
   // Sanity check. Recall can't be greater/equal to 1, or negative.
   if (minRecall < 0 || minRecall >=1)
-    throw std::runtime_error("minRecall must be in [0, 1)");
+    Log::Fatal << "Parameter minRecall must be in [0, 1)" << std::endl;
 
   // If the object wasn't trained, die here.
   if (referenceSet == NULL)
@@ -174,7 +176,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
       "with default sampling rate and new k." << std::endl;
     Train(*referenceSet, 0.1, k); // Default sampling rate.
   }
-  // Steps 1 - 5 happen in Train().
+
+  // Note: Steps 1 - 5 happen in Train().
 
   // Step 6. Fit Gamma distributions to pairwise distances and kNN distances,
   // generated or estimated in steps 3 and 5.
@@ -189,6 +192,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   Timer::Start("fitting_distributions");
   for (size_t i = 1; i <= k; ++i)
   {
+    // Use the arithmetic and geometric mean predictors that were trained in
+    // Train() to estimate the statistics for the given datasetSize and k.
     meanVec(i) = aMeanPredictor.Predict(datasetSize, k);
     logMeanVec(i) = std::log(meanVec(i));
     // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog
@@ -198,6 +203,9 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   distancesDistribution.Train(logMeanVec, meanLogVec, meanVec);
   Timer::Stop("fitting_distributions");
 
+  // See if works
+  //GenerateTemplateSequence(3, 0.5, 8);
+
   // Step 7. Run Binary search on parameter space to minimize selectivity while
   // keeping recall above minimum.
 
@@ -251,6 +259,211 @@ LSHObject(const size_t numProjIn,
   return lshObjectVector[lshObjectVector.size() - 1];
 }
 
+// Sum the scores of the actions that are marked in perturbation set A.
+template<typename SortPolicy, typename ObjectiveFunction>
+inline force_inline
+double LSHModel<SortPolicy, ObjectiveFunction>::PerturbationScore(
+    const std::vector<bool>& A,
+    const arma::vec& scores) const
+{
+  double score = 0.0;
+  for (size_t i = 0; i < A.size(); ++i)
+    if (A[i])
+      score += scores(i); // Only marked (true) positions contribute.
+  return score;
+}
+
+// Shift operation: move the largest marked index of A one position up.
+template<typename SortPolicy, typename ObjectiveFunction>
+inline force_inline
+bool LSHModel<SortPolicy, ObjectiveFunction>::PerturbationShift(
+    std::vector<bool>& A) const
+{
+  size_t maxPos = 0; // Stays 0 if no position of A is marked.
+  for (size_t i = 0; i < A.size(); ++i)
+    if (A[i] == 1) // Marked true; the last hit is the largest marked index.
+      maxPos = i;
+
+  if (maxPos + 1 < A.size()) // Otherwise the shifted index falls outside A.
+  {
+    A[maxPos] = 0;
+    A[maxPos + 1] = 1;
+    return true; // A was shifted.
+  }
+  return false; // A is left unmodified.
+}
+
+// Expand operation: additionally mark the index after the largest marked one.
+template<typename SortPolicy, typename ObjectiveFunction>
+inline force_inline
+bool LSHModel<SortPolicy, ObjectiveFunction>::PerturbationExpand(
+    std::vector<bool>& A) const
+{
+  // Find the last '1' in A (maxPos stays 0 if no position is marked).
+  size_t maxPos = 0;
+  for (size_t i = 0; i < A.size(); ++i)
+    if (A[i]) // Marked true.
+      maxPos = i;
+
+  if (maxPos + 1 < A.size()) // Otherwise the new index falls outside A.
+  {
+    A[maxPos + 1] = 1;
+    return true;
+  }
+  return false;
+}
+
+// Check that no dimension is perturbed more than once by perturbation set A.
+template<typename SortPolicy, typename ObjectiveFunction>
+inline force_inline
+bool LSHModel<SortPolicy, ObjectiveFunction>::PerturbationValid(
+    const std::vector<bool>& A,
+    size_t numProj) const
+{
+  // check[d] is set once dimension d has been seen. Position i of A is mapped
+  // to dimension i % numProj.
+  std::vector<bool> check(numProj);
+
+  if (A.size() > 2 * numProj)
+    return false; // This should never happen.
+
+  // NOTE(review): confirm i % numProj matches the caller's sorted positions.
+  for (size_t i = 0; i < A.size(); ++i)
+  {
+    // Only check dimensions that were included.
+    if (!A[i])
+      continue;
+
+    // If dimension is unseen thus far, mark it as seen.
+    if (check[i % numProj] == false)
+      check[i % numProj] = true;
+    else
+      return false; // If dimension was seen before, set is not valid.
+  }
+  // If we didn't fail, set is valid.
+  return true;
+}
+
+// Generate a template probing sequence for a given M (numProj), W and T.
+template <typename SortPolicy, typename ObjectiveFunction>
+void LSHModel<SortPolicy, ObjectiveFunction>::GenerateTemplateSequence(
+    size_t numProj,
+    double hashWidth,
+    size_t numProbes)
+{
+  // Nothing to do without probes or projections (numProj - 1 would wrap).
+  if (numProbes == 0 || numProj == 0)
+    return;
+
+  // If number of additional probes exceeds possible (2^numProj - 1), cap it.
+  // Shift a size_t, and skip the cap when 2^numProj does not fit in one.
+  if (numProj < 8 * sizeof(size_t) &&
+      numProbes > (size_t(1) << numProj) - 1)
+    numProbes = (size_t(1) << numProj) - 1;
+
+  // Calculate the expected scores based on Multi-probe LSH paper.
+  arma::vec scores(2 * numProj);
+  double M = (double) numProj; // To avoid integer division headache.
+  // "Positive" scores: W^2 * (j + 1)(j + 2) / (4(M + 1)(M + 2)), 0-based j.
+  for (size_t j = 0; j < numProj; ++j)
+    scores(j) = pow(hashWidth, 2) * ((j + 1) * (j + 2))/(4 * (M + 1) * (M + 2));
+  // "Negative" scores.
+  for (size_t j = numProj; j < 2 * numProj; ++j)
+    scores(j) = pow(hashWidth, 2) *
+      (1 -
+       (2 * M + 1 - (j + 1))/(M + 1) +
+       ((2 * M + 1 - (j + 1)) * (2 * M + 2 - (j + 1)))/(4 * (M + 1) * (M + 2)));
+
+  // A "+1" signifies a positive perturbation, a "-1" a negative one.
+  arma::Col<short int> actions(2 * numProj); // will be [1 ... -1 ...]
+  actions.rows(0, numProj - 1) = // First numProj rows.
+    arma::ones< arma::Col<short int> > (numProj); // 1s
+  actions.rows(numProj, (2 * numProj) - 1) = // Last numProj rows.
+    -1 * arma::ones< arma::Col<short int> > (numProj); // -1s
+
+  // The "acting dimension", or which of the numProj dimension to increase or
+  // reduce according to the "actions".
+  arma::Col<size_t> positions(2 * numProj); // Will be [0 1 2 ... 0 1 2 ...].
+  positions.rows(0, numProj - 1) =
+    arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
+  positions.rows(numProj, 2 * numProj - 1) =
+    arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
+
+  // Sort all three vectors so smaller scoring perturbations are first.
+  arma::uvec sortidx = arma::sort_index(scores);
+  scores = scores(sortidx);
+  actions = actions(sortidx);
+  positions = positions(sortidx);
+
+  // From LSHSearch::GetAdditionalProbingBins. TODO: Modularize?
+
+  // Perturbation sets (A) mark with 1 the (score, action, dimension) positions
+  // included in a given perturbation vector. Other spaces are 0.
+  std::vector<bool> Ao(2 * numProj);
+  Ao[0] = 1; // Smallest vector includes only smallest score.
+
+  std::vector< std::vector<bool> > perturbationSets;
+  perturbationSets.push_back(Ao); // Storage of perturbation sets.
+
+  std::priority_queue<
+    std::pair<double, size_t>,        // contents: pairs of (score, index)
+    std::vector<                      // container: vector of pairs
+      std::pair<double, size_t>
+      >,
+    std::greater< std::pair<double, size_t> > // comparator of pairs
+  > minHeap; // our minheap
+
+  // Start by adding the lowest scoring set to the minheap.
+  minHeap.push( std::make_pair(PerturbationScore(Ao, scores), 0) );
+
+  // Loop invariant: after pvec iterations, templateSequence contains pvec
+  // valid codes of the lowest-scoring bins (bins most likely to contain
+  // neighbors of the query).
+
+  // Allocate 1 column per perturbed "code".
+  this->templateSequence.zeros(numProj, numProbes);
+  for (size_t pvec = 0; pvec < numProbes; ++pvec)
+  {
+    std::vector<bool> Ai;
+    do
+    {
+      // Get the perturbation set corresponding to the minimum score.
+      Ai = perturbationSets[ minHeap.top().second ];
+      minHeap.pop(); // .top() returns, .pop() removes
+
+      // Shift operation on Ai (replace max with max+1).
+      std::vector<bool> As = Ai;
+      if (PerturbationShift(As) && PerturbationValid(As, numProj))
+        // Don't add invalid sets.
+      {
+        perturbationSets.push_back(As); // add shifted set to sets
+        minHeap.push(
+            std::make_pair(PerturbationScore(As, scores),
+            perturbationSets.size() - 1));
+      }
+
+      // Expand operation on Ai (add max+1 to set).
+      std::vector<bool> Ae = Ai;
+      if (PerturbationExpand(Ae) && PerturbationValid(Ae, numProj))
+        // Don't add invalid sets.
+      {
+        perturbationSets.push_back(Ae); // add expanded set to sets
+        minHeap.push(
+            std::make_pair(PerturbationScore(Ae, scores),
+            perturbationSets.size() - 1));
+      }
+    } while (!PerturbationValid(Ai, numProj));//Discard invalid perturbations
+
+    // Found valid perturbation set Ai. Construct perturbation vector from set.
+    for (size_t pos = 0; pos < Ai.size(); ++pos)
+    {
+      // If Ai[pos] is marked, set template to +/- 1.
+      if (Ai[pos] == 1)
+        templateSequence(positions(pos), pvec) = actions(pos);
+    }
+  }
+}
+
 // Fit a curve to the data provided.
 template<typename SortPolicy, typename ObjectiveFunction>
 double LSHModel<SortPolicy, ObjectiveFunction>::DistanceStatisticPredictor::Train(
@@ -266,7 +479,10 @@ double LSHModel<SortPolicy, ObjectiveFunction>::DistanceStatisticPredictor::Trai
 
   // Get an initial point from the optimizer.
   arma::mat currentPoint = f.GetInitialPoint();
+  // Silence debug output of L_BFGS (TODO: remove)
+  Log::Debug.ignoreInput = true;
   double result = opt.Optimize(currentPoint);
+  Log::Debug.ignoreInput = false;
 
   // Optimizer is done - set alpha, beta, gamma.
   this->alpha = currentPoint(0, 0);

From 78cab0be59414f3fa598bdadd61f4ed92742ece9 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 16 Aug 2016 11:02:11 +0100
Subject: [PATCH 08/18] Merges the LSH and LSHModel folders

---
 src/mlpack/methods/CMakeLists.txt             |  1 -
 src/mlpack/methods/lsh/CMakeLists.txt         |  5 +++++
 .../methods/{lsh_model => lsh}/lshmodel.hpp   |  0
 .../{lsh_model => lsh}/lshmodel_impl.hpp      |  0
 .../{lsh_model => lsh}/lshmodel_main.cpp      |  0
 .../{lsh_model => lsh}/objectivefunction.hpp  |  0
 src/mlpack/methods/lsh_model/CMakeLists.txt   | 19 -------------------
 7 files changed, 5 insertions(+), 20 deletions(-)
 rename src/mlpack/methods/{lsh_model => lsh}/lshmodel.hpp (100%)
 rename src/mlpack/methods/{lsh_model => lsh}/lshmodel_impl.hpp (100%)
 rename src/mlpack/methods/{lsh_model => lsh}/lshmodel_main.cpp (100%)
 rename src/mlpack/methods/{lsh_model => lsh}/objectivefunction.hpp (100%)
 delete mode 100644 src/mlpack/methods/lsh_model/CMakeLists.txt

diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt
index 6a098340339..dbbd2318bee 100644
--- a/src/mlpack/methods/CMakeLists.txt
+++ b/src/mlpack/methods/CMakeLists.txt
@@ -36,7 +36,6 @@ set(DIRS
   local_coordinate_coding
   logistic_regression
   lsh
-  lsh_model
 #  mvu
   matrix_completion
   naive_bayes
diff --git a/src/mlpack/methods/lsh/CMakeLists.txt b/src/mlpack/methods/lsh/CMakeLists.txt
index 3540e04c7b9..2660fdb3df2 100644
--- a/src/mlpack/methods/lsh/CMakeLists.txt
+++ b/src/mlpack/methods/lsh/CMakeLists.txt
@@ -4,6 +4,9 @@ set(SOURCES
   # LSH-search class
   lsh_search.hpp
   lsh_search_impl.hpp
+  # LSH-model class
+  lshmodel.hpp
+  lshmodel_impl.hpp
 )
 
 # Add directory name to sources.
@@ -18,3 +21,5 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
 # The code to compute the approximate neighbor for the given query and reference
 # sets with p-stable LSH.
 add_cli_executable(lsh)
+# The code that models LSH to make parameter tuning easier.
+add_cli_executable(lshmodel)
diff --git a/src/mlpack/methods/lsh_model/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp
similarity index 100%
rename from src/mlpack/methods/lsh_model/lshmodel.hpp
rename to src/mlpack/methods/lsh/lshmodel.hpp
diff --git a/src/mlpack/methods/lsh_model/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
similarity index 100%
rename from src/mlpack/methods/lsh_model/lshmodel_impl.hpp
rename to src/mlpack/methods/lsh/lshmodel_impl.hpp
diff --git a/src/mlpack/methods/lsh_model/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp
similarity index 100%
rename from src/mlpack/methods/lsh_model/lshmodel_main.cpp
rename to src/mlpack/methods/lsh/lshmodel_main.cpp
diff --git a/src/mlpack/methods/lsh_model/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp
similarity index 100%
rename from src/mlpack/methods/lsh_model/objectivefunction.hpp
rename to src/mlpack/methods/lsh/objectivefunction.hpp
diff --git a/src/mlpack/methods/lsh_model/CMakeLists.txt b/src/mlpack/methods/lsh_model/CMakeLists.txt
deleted file mode 100644
index c3799753aec..00000000000
--- a/src/mlpack/methods/lsh_model/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# Define the files we need to compile.
-# Anything not in this list will not be compiled into mlpack.
-set(SOURCES
-  # LSH-model class
-  lshmodel.hpp
-  lshmodel_impl.hpp
-)
-
-# Add directory name to sources.
-set(DIR_SRCS)
-foreach(file ${SOURCES})
-  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
-endforeach()
-# Append sources (with directory name) to list of all mlpack sources (used at
-# the parent scope).
-set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
-
-# The code that models LSH to return a set of parameters that works well.
-add_cli_executable(lshmodel)

From 2c88406a752ae797089cb8c617cc698963fe8500 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 16 Aug 2016 11:10:25 +0100
Subject: [PATCH 09/18] Removes LogDebug.ignoreInput statement

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index b2de464eb6c..476a89d0c85 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -480,9 +480,7 @@ double LSHModel<SortPolicy, ObjectiveFunction>::DistanceStatisticPredictor::Trai
   // Get an initial point from the optimizer.
   arma::mat currentPoint = f.GetInitialPoint();
   // Silence debug output of L_BFGS (TODO: remove)
-  Log::Debug.ignoreInput = true;
   double result = opt.Optimize(currentPoint);
-  Log::Debug.ignoreInput = false;
 
   // Optimizer is done - set alpha, beta, gamma.
   this->alpha = currentPoint(0, 0);

From f4af3dc878f5f9681078cee7ec678557eace455c Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Sun, 21 Aug 2016 13:47:13 +0100
Subject: [PATCH 10/18] Separates DistanceStatisticPredictor implementation

---
 .../lsh/distance_statistic_predictor.hpp      | 125 ++++++++++
 src/mlpack/methods/lsh/lshmodel.hpp           | 153 +++++-------
 src/mlpack/methods/lsh/lshmodel_impl.hpp      | 233 +++++++++++++-----
 src/mlpack/methods/lsh/lshmodel_main.cpp      |   7 +-
 4 files changed, 355 insertions(+), 163 deletions(-)
 create mode 100644 src/mlpack/methods/lsh/distance_statistic_predictor.hpp

diff --git a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp
new file mode 100644
index 00000000000..da1caf5a699
--- /dev/null
+++ b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp
@@ -0,0 +1,125 @@
+/** 
+ * @file distance_statistic_predictor.hpp
+ * @author Yannis Mentekidis
+ *
+ * This file defines a helper class that uses the function a * k^b * N^c, for
+ * some fitted parameters a, b, c, to predict either the arithmetic or the
+ * geometric mean of the squared distance of a point to its k-th nearest
+ * neighbor, given the dataset size N and the value of k.
+ *
+ * DistanceStatisticPredictor objects are used by the LSHModel class of mlpack.
+ */
+#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_DISTANCE_STATISTIC_PREDICTOR_HPP
+#define MLPACK_METHODS_NEIGHBOR_SEARCH_DISTANCE_STATISTIC_PREDICTOR_HPP
+
+// For curve fitting.
+#include <mlpack/core/optimizers/lbfgs/lbfgs.hpp>
+// Default objective function.
+#include "objectivefunction.hpp"
+
+namespace mlpack
+{
+namespace neighbor
+{
+
+//! Predicts a distance statistic as alpha * k^beta * N^gamma, with alpha, beta
+//! and gamma fitted by Train() or set through the accessors.
+template <typename ObjectiveFunction = DefaultObjectiveFunction>
+class DistanceStatisticPredictor
+{
+ public:
+  //! Empty constructor. Parameters are 0 until Train() or a setter is called.
+  DistanceStatisticPredictor() : alpha(0.0), beta(0.0), gamma(0.0) { }
+
+  /**
+   * Construct and immediately fit the parameters by calling Train().
+   *
+   * @param inputSize A vector of input sizes. The first input variable of
+   *     the regression.
+   * @param kValues A vector of k values. The second input variable of the
+   *     regression.
+   * @param statistic A vector of responses - the value of the statistic for
+   *     each given inputSize.
+   */
+  DistanceStatisticPredictor(const arma::Col<size_t>& inputSize,
+                             const arma::Col<size_t>& kValues,
+                             const arma::mat& statistic)
+  { Train(inputSize, kValues, statistic); }
+
+  /**
+   * Function that fits the alpha, beta and gamma parameters.
+   *
+   * @param inputSize A vector of input sizes. The first input variable of
+   *     the regression.
+   * @param kValues A vector of k values. The second input variable of the
+   *     regression.
+   * @param statistic A vector of responses - the value of the statistic for
+   *     each given inputSize.
+   * @return The value returned by the optimizer's Optimize() call.
+   */
+  double Train(const arma::Col<size_t>& inputSize,
+               const arma::Col<size_t>& kValues,
+               const arma::mat& statistic);
+
+  /**
+   * Evaluate the statistic, alpha * k^beta * N^gamma, for a given N and k.
+   *
+   * @param N A new input size for which to evaluate the expected statistic.
+   * @param k The k (of the k-nearest neighbor) to evaluate the statistic for.
+   */
+  double Predict(size_t N, size_t k) const
+  { return alpha * std::pow(k, beta) * std::pow(N, gamma); }
+
+  //! Set the alpha parameter.
+  void Alpha(double a) { alpha = a; }
+
+  //! Get the alpha parameter.
+  double Alpha(void) const { return alpha; }
+
+  //! Set the beta parameter.
+  void Beta(double b) { beta = b; }
+
+  //! Get the beta parameter.
+  double Beta(void) const { return beta; }
+
+  //! Set the gamma parameter.
+  void Gamma(double c) { gamma = c; }
+
+  //! Get the gamma parameter.
+  double Gamma(void) const { return gamma; }
+
+ private:
+  double alpha;
+  double beta;
+  double gamma;
+};
+
+// Fit alpha, beta and gamma so the curve matches the given statistic.
+template <typename ObjectiveFunction>
+double DistanceStatisticPredictor<ObjectiveFunction>::Train(
+    const arma::Col<size_t>& inputSize,
+    const arma::Col<size_t>& kValues,
+    const arma::mat& statistic)
+{
+  // Build the objective over the training data, and an L_BFGS optimizer on it.
+  // (TODO: Make the optimizer a template parameter?)
+  ObjectiveFunction objective(inputSize, kValues, statistic);
+  mlpack::optimization::L_BFGS<ObjectiveFunction> optimizer(objective);
+
+  // Start from the objective's initial point and let the optimizer refine it.
+  arma::mat coordinates = objective.GetInitialPoint();
+  const double optimizerResult = optimizer.Optimize(coordinates);
+
+  // The first three rows of the point are alpha, beta and gamma, in order.
+  alpha = coordinates(0, 0);
+  beta = coordinates(1, 0);
+  gamma = coordinates(2, 0);
+
+  // Hand the optimizer's return value back to the caller.
+  return optimizerResult;
+}
+
+} // namespace neighbor
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp
index ca146f4d503..1c839d6ebba 100644
--- a/src/mlpack/methods/lsh/lshmodel.hpp
+++ b/src/mlpack/methods/lsh/lshmodel.hpp
@@ -45,12 +45,10 @@
 // For template parameters and kNN search (if nescessary).
 #include <mlpack/methods/neighbor_search/neighbor_search.hpp>
 #include <mlpack/methods/neighbor_search/sort_policies/nearest_neighbor_sort.hpp>
-// For curve fitting.
-#include <mlpack/core/optimizers/lbfgs/lbfgs.hpp>
-// Default objective function.
-#include "objectivefunction.hpp"
 // Gamma distribution for modeling squared distances.
 #include <mlpack/core/dists/gamma_distribution.hpp>
+// For fitting distance statistic regressors.
+#include "distance_statistic_predictor.hpp"
 
 namespace mlpack {
 namespace neighbor {
@@ -111,8 +109,9 @@ class LSHModel
               const size_t maxKValue = 32);
 
    /**
-    * Predict() finds LSH parameters that should work well for the dataset the 
-    * LSHModel was trained for. 
+    * This function uses the trained model to predict recall / selectivity
+    * values for a given parameter set.
+    *
     * Warning: If the k specified is larger than the maxKValue passed to
     * Train(), Train() will be called again. This might have adverse effects to
     * performance.
@@ -123,7 +122,12 @@ class LSHModel
     */
    void Predict(const size_t datasetSize, 
                 const size_t k, 
-                const double minRecall);
+                const size_t numTables,
+                const size_t numProj,
+                const size_t numProbes,
+                const double hashWidth,
+                double& predictedRecall,
+                double& predictedSelect);
 
    /**
     * This function returns an LSHSearch object trained with the parameters
@@ -200,29 +204,59 @@ class LSHModel
    * @param numProj The number of projections for the sequence under validation.
   */
   bool PerturbationValid(const std::vector<bool>& A, size_t numProj) const;
+
    /**
     * Function that creates a template perturbation sequence given a value for
     * an M and a W. The template perturbation sequence is based on the
     * statistical properties of multi-probe LSH and uses those, instead of
-    * specific points, to generate scores.
+    * specific points, to generate scores. The template sequence is also
+    * independent of the hashWidth, and depends only on numProj and numProbes.
+    *
     * See mlpack/methods/lsh/lsh_search_impl.hpp for more details about how
     * perturbation sequences are generated from specific points.
     *
     * @param numProj The number of projections for the LSH scheme for which we
     *     want to compute the template perturbation sequence.
-    * @param hashWidth The hash width for the LSH scheme.
     * @param numProbes The number of probes to generate.
     */
    void GenerateTemplateSequence(size_t numProj, 
-                                 double hashWidth, 
                                  size_t numProbes);
 
-   /** Matrix that stores, in each column, the "direction" of the perturbation:
-    * 0 means no perturbation on that dimension, -1 means reduce dimension value
-    * by 1, and +1 means increase dimension value by 1.
+   /**
+    * This function evaluates the probability that two points that are at
+    * distance chi from each other will be neighbors when we use LSH with a
+    * specific number of projections, probing bins, and tables for a given hash
+    * width.
+    *
+    * @param chi The distance of two points.
+    * @param hashWidth The first-level hash width.
+    * @param numTables The number of random projection tables used by LSH.
+    * @param numProj The number of projections per hash table (dimensionality of
+    *     new space).
+    * @param numProbes The number of additional probing bins of Multiprobe LSH.
     */
-   
-   arma::Mat<short int> templateSequence;
+   //TODO: inline?
+   double Rho(double chi,
+              double hashWidth,
+              size_t numTables,
+              size_t numProj, 
+              size_t numProbes);
+   /**
+    * This is a helper function that is called by Rho() and returns the inner
+    * value of the product used in the calculation of the probability that Rho
+    * calculates.
+    *
+    * @param chi The distance of two points.
+    * @param hashWidth The first-level hash width.
+    * @param delta The perturbation to evaluate for.
+    * @param proj The projection we evaluate for ( 0 <= proj < numProj).
+    * @param numProj The total number of projections.
+    */
+   inline double SameBucketProbability(double chi, 
+                                       double hashWidth, 
+                                       short delta,
+                                       size_t proj,
+                                       size_t numProj);
 
    /**
     * Function that fits two DistanceStatisticPredictors - one
@@ -242,88 +276,19 @@ class LSHModel
                                  const arma::mat& Ek, 
                                  const arma::mat& Gk);
 
-   /**
-    * This is a helper class that uses the function a * k^b * N^c for some
-    * parameters a, b, c that have been fit to either predict the arithmetic or
-    * geometric mean of the squared distance of a point to its k-nearest
-    * neighbor, given some dataset size N and its k-nearest neighbor.
+
+   /** 
+    * Matrix that stores, in each column, the "direction" of the perturbation:
+    * 0 means no perturbation on that dimension, -1 means reduce dimension value
+    * by 1, and +1 means increase dimension value by 1.
     */
-   class DistanceStatisticPredictor
-   {
-    public:
-      //! Empty constructor.
-      DistanceStatisticPredictor() { };
-
-      /** 
-       * Function to construct with training set.
-       *
-       * @param inputSize A vector of input sizes. The first input variable of 
-       *     the regression.
-       * @param kValues A vector of k values. The second input variable of the
-       *     regression.
-       * @param statistic A vector of responses - the value of the statistic for
-       *     each given inputSize.
-       */
-      DistanceStatisticPredictor(const arma::Col<size_t>& inputSize, 
-                                 const arma::Col<size_t>& kValues,
-                                 const arma::mat& statistic) 
-      { Train(inputSize, kValues, statistic); };
-      
-      //! Default destructor.
-      ~DistanceStatisticPredictor() { };
-
-      /**
-       * Function that fits the alpha, beta and gamma parameters.
-       *
-       * @param inputSize A vector of input sizes. The first input variable of 
-       *     the regression.
-       * @param kValues A vector of k values. The second input variable of the
-       *     regression.
-       * @param statistic A vector of responses - the value of the statistic for
-       *     each given inputSize.
-       */
-      double Train(const arma::Col<size_t>& inputSize, 
-                 const arma::Col<size_t>& kValues,
-                 const arma::mat& statistic);
-
-      /** 
-       * Evaluate the statistic for a given dataset size.
-       *
-       * @param N - a new input size for which to evaluate the expected
-       *     statistic.
-       */
-      double Predict(size_t N, size_t k) 
-      { return alpha * std::pow(k, beta) * std::pow(N, gamma); };
-
-      //! Set the alpha parameter.
-      void Alpha(double a) { alpha = a; };
-
-      //! Get the alpha parameter.
-      double Alpha(void) { return alpha; };
-      
-      //! Set the beta parameter.
-      void Beta(double b) { beta = b; };
-
-      //! Get the beta parameter.
-      double Beta(void) { return beta; };
-
-      //! Set the gamma parameter.
-      void Gamma(double c) { gamma = c; };
-
-      //! Get the gamma parameter.
-      double Gamma(void) { return gamma; };
-
-    private:
-      double alpha;
-      double beta;
-      double gamma;
-   };
+   arma::Mat<short int> templateSequence;
 
    //! DistanceStatisticPredictor for arithmetic mean.
-   DistanceStatisticPredictor aMeanPredictor;
+   DistanceStatisticPredictor<ObjectiveFunction> aMeanPredictor;
 
    //! DistanceStatisticPredictor for geometric mean.
-   DistanceStatisticPredictor gMeanPredictor;
+   DistanceStatisticPredictor<ObjectiveFunction> gMeanPredictor;
 
    //! (k+1)-dimensional gamma distribution for predicting squared distances.
    mlpack::distribution::GammaDistribution distancesDistribution;
@@ -349,8 +314,8 @@ class LSHModel
    //! Reference dataset.
    const arma::mat* referenceSet;
 
-   //! LSHSearch Object Vector.
-   std::vector<LSHSearch<SortPolicy>> lshObjectVector;
+   //! LSHSearch Object
+   LSHSearch<SortPolicy> trainedLSHObject;
 
    //! Statistic: average squared distance of points.
    double meanDist;
diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index 476a89d0c85..6ea549c9700 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -8,6 +8,7 @@
 #define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_MODEL_IMPL_HPP
 
 #include "lshmodel.hpp"
+#include <boost/math/distributions/normal.hpp> // pdf and cdf needed
 
 
 //TODO: remove
@@ -151,22 +152,23 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
 }
 
 
-// Predict parameters for LSH that will have acceptable recall.
+// Predict recall / selectivity for the given parameters.
 template <typename SortPolicy, typename ObjectiveFunction>
 void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
                                                       const size_t k,
-                                                      const double minRecall)
+                                                      const size_t numTables,
+                                                      const size_t numProj,
+                                                      const size_t numProbes,
+                                                      const double hashWidth,
+                                                      double& predictedRecall,
+                                                      double& predictedSelect)
 {
-  // Sanity check. Recall can't be greater/equal to 1, or negative.
-  if (minRecall < 0 || minRecall >=1)
-    Log::Fatal << "Parameter minRecall must be in [0, 1)" << std::endl;
-
   // If the object wasn't trained, die here.
   if (referenceSet == NULL)
     Log::Fatal << "Attempt to use Predict() on untrained Object. Exiting."
         << std::endl;
 
-  // Before proceeding, if requested K is larger than the k we trained with,
+  // Before proceeding, if requested k is larger than the k we trained with,
   // re-train the object.
   if (k > maxKValue)
   {
@@ -203,12 +205,139 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   distancesDistribution.Train(logMeanVec, meanLogVec, meanVec);
   Timer::Stop("fitting_distributions");
 
-  // See if works
-  //GenerateTemplateSequence(3, 0.5, 8);
+  // Step 7. Generate the Template Probing Sequence using the maximum number of
+  // projections and the maximum number of probes.
+  GenerateTemplateSequence(numProj, numProbes);
+
+  // Step 8. Use formulas (19) and (20) from the paper to predict recall and
+  // selectivity, using LSHModel::Rho() and the distribution functions of the
+  // gammas we fit back in Step 6.
+  predictedRecall = 0.5;
+  predictedSelect = 0.5;
+}
+
+
+/* NOTE: My interpretation of the paper would result in this code, but LSHKIT's
+ * implementation is different. I'm commenting this out to try their way, and I
+ * might go back to this if I see both work the same.
+
+// Probability of two points being neighbors if they are at distance chi.
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::Rho(double chi,
+                                                    double hashWidth,
+                                                    size_t numTables,
+                                                    size_t numProj,
+                                                    size_t numProbes)
+{
+  // Calculate the formula:
+  // 1 - {Prod{1 - Prod{same_bin_probability}}}^numTables, where:
+  // * same_bin_probability is calculated with the Value() function.
+  // * Prod{same_bin_probability} is stored in product.
+  // * Prod{1 - Prod{same_bin_probability}} is stored in rho.
+
+  double rho = 1;
+
+  // Row-major loop :(. TODO: Refactor to make column-major.
+  for (size_t proj = 0; proj < numProj; ++proj)
+  {
+    double product = 1;
+    for (size_t probe = 0; probe < numProbes; ++probe)
+    {
+      // Use perturbation value (proj, probe), i.e. \delta_{\mu, \tau}
+      product *= Value(chi, hashWidth, templateSequence(proj, probe), numProj);
+    }
+
+    rho *= (1 - product);
+  }
+
+  return 1 - std::pow(rho, numTables);
+}
+
+// Probability of two points being neighbors if they are at distance chi.
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::SameBucketProbability(double chi,
+                      double hashWidth,
+                      short delta,
+                      size_t proj,
+                      size_t numProj)
+{
+  if (delta == 0)
+  {
+    // No perturbation - probability of two queries sharing the same bin.
+    // Use the "default" normal distribution with mean = 0, sd = 1.
+    boost::math::normal_distribution phi;
+    return 2 * phi.pdf(hashWidth / chi) - 1
+      + std::sqrt(2 / M_PI) 
+      * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi);
+  }
+  else
+  {
+    // +1/-1 perturbation - probability of two queries being in adjacent bins.
+    double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0));
+    
+    // Negative perturbation - flip deltaI.
+    if (delta == -1)
+      deltaI = 1 - deltaI;
+
+    boost::math::normal_distribution phi(-delta, chi);
+    return phi.cdf(hashWidth) - phi.cdf(0);
+  }
+}
+*/
+
+/*
+ * Based on the LSHKIT implementation, not my understanding of the paper.
+ */
+// Probability of two points being neighbors if they are at distance chi.
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::Rho(double chi,
+                                                    double hashWidth,
+                                                    size_t numTables,
+                                                    size_t numProj,
+                                                    size_t numProbes)
+{
+  double rho = 0;
+
+  for (size_t probe = 0; probe < numProbes; ++probe)
+  {
+    double rTemp = 1;
+    for (size_t proj = 0; proj < numProj; ++proj)
+    {
+      rTemp *= SameBucketProbability(chi, hashWidth, 
+          templateSequence(proj, probe), proj, numProj);
+    }
+    rho += rTemp;
+  }
 
-  // Step 7. Run Binary search on parameter space to minimize selectivity while
-  // keeping recall above minimum.
+  return 1 - std::exp(std::log(1.0 - rho) * numTables);
+}
 
+// Helper for Rho(): per-projection probability term for perturbation delta.
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::
+SameBucketProbability(double chi, double hashWidth, short delta, size_t proj,
+                      size_t numProj)
+{
+  boost::math::normal_distribution<> phi;
+  if (delta == 0)
+  {
+    // No perturbation - probability of two queries sharing the same bin.
+    return 2 * pdf(phi, hashWidth / chi) - 1
+      + std::sqrt(2 / M_PI) 
+      * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi);
+  }
+  else
+  {
+    // +1/-1 perturbation - probability of two queries being in adjacent bins.
+    double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0));
+    
+    // Negative perturbation - flip deltaI.
+    if (delta == -1)
+      deltaI = 1 - deltaI;
+
+    return cdf(phi, hashWidth / chi * (1 + deltaI)) 
+        - cdf(phi, hashWidth / chi * deltaI);
+  }
 }
 
 // Fit two predictors, one for arithmetic mean E and one for geometric mean G.
@@ -254,9 +383,9 @@ LSHObject(const size_t numProjIn,
   LSHSearch<> lsh(*referenceSet, numProjOut, numTablesOut, hashWidthOut,
       secondHashSize, bucketSize);
 
-  lshObjectVector.push_back(lsh);
+  trainedLSHObject = lsh;
 
-  return lshObjectVector[lshObjectVector.size() - 1];
+  return trainedLSHObject;
 }
 
 // Helper function to generate perturbations.
@@ -348,30 +477,35 @@ bool LSHModel<SortPolicy, ObjectiveFunction>::PerturbationValid(
 template <typename SortPolicy, typename ObjectiveFunction>
 void LSHModel<SortPolicy, ObjectiveFunction>::GenerateTemplateSequence(
     size_t numProj,
-    double hashWidth,
     size_t numProbes)
 {
-  // If no additional probes requested, stop here.
+  // If no probes requested, stop here.
   if (numProbes == 0)
+  {
+    Log::Warn << "GenerateTemplateSequence called with numProbes = 0"
+      << std::endl;
     return;
+  }
 
   // If number of additional probes exceeds possible, set to max possible.
-  if (numProbes > ((1 << numProj) - 1))
-    numProbes = (1 << numProj) - 1;
+  if (numProbes > pow(3, numProj))
+    numProbes = pow(3, numProj); // {-1, 0, 1} for each probe.
 
   // Calculate the expected scores based on Multi-probe LSH paper.
   arma::vec scores(2 * numProj);
-  double M = (double) numProj; // To avoid integer division headache.
-  // "Positive" scores.
-  for (size_t j = 0; j < numProj; ++j)
-    scores(j) = pow(hashWidth, 2) * (j + 1 * (j + 2))/(4 * (M + 1) * (M + 2));
-  // "Negative" scores.
-  for (size_t j = numProj; j < 2 * numProj; ++j)
-    scores(j) = pow(hashWidth, 2) *
-      (1 -
-       (2 * M + 1 - (j + 1))/(M + 1) +
-       ((2 * M + 1 - (j + 1)) * (2 * M + 2 - (j + 1)))/(4 * (M + 1) * (M + 2)));
-  cout << scores << endl;
+  double M = (double) numProj;
+
+  // Generate expected scores in sorted order.
+  for (size_t i = 0; i < numProj; ++i)
+  {
+    // Everything is double to avoid integer division headache.
+    double left = double(i);
+    double right = 2 * M - left - 1;
+
+    // Expected score - left boundary.
+    scores[left] = (left + 1) * (left + 2) / (2 * (M + 1) * (M + 2));
+    scores[right] = 1 - (left + 1)/(M + 1) + scores[left];
+  }
 
   // A "+1" signifies a positive perturbation, a "-1" a negative one.
   arma::Col<short int> actions(2 * numProj); // will be [1 ... -1 ...]
@@ -388,14 +522,6 @@ void LSHModel<SortPolicy, ObjectiveFunction>::GenerateTemplateSequence(
   positions.rows(numProj, 2 * numProj - 1) =
     arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
 
-  // Sort all three vectors so smaller scoring perturbations are first.
-  arma::uvec sortidx = arma::sort_index(scores);
-  scores = scores(sortidx);
-  actions = actions(sortidx);
-  positions = positions(sortidx);
-
-  // From LSHSearch::GetAdditionalProbingBins. TODO: Modularize?
-
   // Perturbation sets (A) mark with 1 the (score, action, dimension) positions
   // included in a given perturbation vector. Other spaces are 0.
   std::vector<bool> Ao(2 * numProj);
@@ -415,13 +541,11 @@ void LSHModel<SortPolicy, ObjectiveFunction>::GenerateTemplateSequence(
   // Start by adding the lowest scoring set to the minheap.
   minHeap.push( std::make_pair(PerturbationScore(Ao, scores), 0) );
 
-  // Loop invariable: after pvec iterations, additionalProbingBins contains pvec
-  // valid codes of the lowest-scoring bins (bins most likely to contain
-  // neighbors of the query).
-
   // Allocate 1 column per perturbed "code".
-  this->templateSequence.zeros(numProj, numProbes);
-  for (size_t pvec = 0; pvec < numProbes; ++pvec)
+  templateSequence.zeros(numProj, numProbes);
+
+  // Column 0 is all 0s. Fill columns 1:numProbes using Lv's algorithm.
+  for (size_t pvec = 1; pvec < numProbes; ++pvec)
   {
     std::vector<bool> Ai;
     do
@@ -464,31 +588,6 @@ void LSHModel<SortPolicy, ObjectiveFunction>::GenerateTemplateSequence(
   }
 }
 
-// Fit a curve to the data provided.
-template<typename SortPolicy, typename ObjectiveFunction>
-double LSHModel<SortPolicy, ObjectiveFunction>::DistanceStatisticPredictor::Train(
-    const arma::Col<size_t>& inputSize,
-    const arma::Col<size_t>& kValues,
-    const arma::mat& statistic)
-{
-  // Objective function for fitting the E(x, k) curve to the statistic.
-  ObjectiveFunction f(inputSize, kValues, statistic);
-
-  // Optimizer. Use L_BFGS (TODO: Make this a template parameter?)
-  mlpack::optimization::L_BFGS<ObjectiveFunction> opt(f);
-
-  // Get an initial point from the optimizer.
-  arma::mat currentPoint = f.GetInitialPoint();
-  // Silence debug output of L_BFGS (TODO: remove)
-  double result = opt.Optimize(currentPoint);
-
-  // Optimizer is done - set alpha, beta, gamma.
-  this->alpha = currentPoint(0, 0);
-  this->beta = currentPoint(1, 0);
-  this->gamma = currentPoint(2, 0);
-
-  return result;
-}
 
 // Serialize the object and save to a file.
 template <typename SortPolicy, typename ObjectiveFunction>
diff --git a/src/mlpack/methods/lsh/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp
index 00cd2d03e57..20b9f0ee4f6 100644
--- a/src/mlpack/methods/lsh/lshmodel_main.cpp
+++ b/src/mlpack/methods/lsh/lshmodel_main.cpp
@@ -20,10 +20,13 @@ int main(int argc, char* argv[])
   size_t d = 10;
   size_t k = 5;
   double sampleSize = 0.25;
-  double minRecall = 0.4;
+  double recall, selectivity;
   arma::mat rdata(d, N, arma::fill::randu);
   LSHModel<> model(rdata, sampleSize, k);
-  model.Predict(N, k, minRecall);
+  model.Predict(N, k, 16, 4, 4, 1.0, recall, selectivity);
+
+  Log::Info << "Model predicts " << recall*100 << "\% recall and "
+    << selectivity*100 << "\% selectivity." << std::endl;
 
   arma::mat qdata(d, 1, arma::fill::randu);
   arma::Mat<size_t> neighbors;

From a73b46893eeaae340553de1e561e485424708fc3 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Sun, 21 Aug 2016 14:58:53 +0100
Subject: [PATCH 11/18] First working version (needs debugging)

---
 .../lsh/distance_statistic_predictor.hpp      |  2 +-
 src/mlpack/methods/lsh/lshmodel.hpp           | 95 ++++++++++++++++---
 src/mlpack/methods/lsh/lshmodel_impl.hpp      | 73 +++++++++++++-
 3 files changed, 152 insertions(+), 18 deletions(-)

diff --git a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp
index da1caf5a699..fc6b34a4b56 100644
--- a/src/mlpack/methods/lsh/distance_statistic_predictor.hpp
+++ b/src/mlpack/methods/lsh/distance_statistic_predictor.hpp
@@ -104,7 +104,7 @@ double DistanceStatisticPredictor<ObjectiveFunction>::Train(
   // Objective function for fitting the E(x, k) curve to the statistic.
   ObjectiveFunction f(inputSize, kValues, statistic);
 
-  // Optimizer. Use L_BFGS (TODO: Make this a template parameter?)
+  // Optimizer. Use L_BFGS
   mlpack::optimization::L_BFGS<ObjectiveFunction> opt(f);
 
   // Get an initial point from the optimizer.
diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp
index 1c839d6ebba..5c250404bb7 100644
--- a/src/mlpack/methods/lsh/lshmodel.hpp
+++ b/src/mlpack/methods/lsh/lshmodel.hpp
@@ -88,7 +88,7 @@ class LSHModel
     * estimate parameters of the dataset. The estimated parameters are:
     *   * Arithmetic mean of pairwise distances of random points in the sample.
     *   * Geometric mean for the pairwise distnaces
-    *   * Arithmetic mean of distance random point to its k-th nearest neighbor 
+    *   * Arithmetic mean of distance random point to its k-th nearest neighbor
     *       as a function of |N|, the number of points.
     *   * Geometric mean of the same distance.
     *
@@ -104,7 +104,7 @@ class LSHModel
     * @param maxKValue The maximum number of nearest neighbors for each query to
     *     train for.
     */
-   void Train(const arma::mat& referenceSet, 
+   void Train(const arma::mat& referenceSet,
               const double sampleRate = 0.1,
               const size_t maxKValue = 32);
 
@@ -120,8 +120,8 @@ class LSHModel
     * @param k The number of k-nearest neighbors LSH must find.
     * @param minRecall The minimum acceptable recall we want to tune for.
     */
-   void Predict(const size_t datasetSize, 
-                const size_t k, 
+   void Predict(const size_t datasetSize,
+                const size_t k,
                 const size_t numTables,
                 const size_t numProj,
                 const size_t numProbes,
@@ -219,7 +219,7 @@ class LSHModel
     *     want to compute the template perturbation sequence.
     * @param numProbes The number of probes to generate.
     */
-   void GenerateTemplateSequence(size_t numProj, 
+   void GenerateTemplateSequence(size_t numProj,
                                  size_t numProbes);
 
    /**
@@ -239,8 +239,8 @@ class LSHModel
    double Rho(double chi,
               double hashWidth,
               size_t numTables,
-              size_t numProj, 
-              size_t numProbes);
+              size_t numProj,
+              size_t numProbes) const;
    /**
     * This is a helper function that is called by Rho() and returns the inner
     * value of the product used in the calculation of the probability that Rho
@@ -252,11 +252,80 @@ class LSHModel
     * @param proj The projection we evaluate for ( 0 <= proj < numProj).
     * @param numProj The total number of projections.
     */
-   inline double SameBucketProbability(double chi, 
-                                       double hashWidth, 
+   inline double SameBucketProbability(double chi,
+                                       double hashWidth,
                                        short delta,
                                        size_t proj,
-                                       size_t numProj);
+                                       size_t numProj) const;
+
+   /**
+    * This function calculates the recall of LSH for a given set of parameters.
+    * It uses the function
+    *
+    * r = \frac{1}{K} \sum_{1}^{K} \int_{0}^{\infty}(Rho(\sqrt{x}) * f_k(x)) dx
+    *
+    * as proposed in the paper.
+    *
+    */
+   double Recall(size_t maxK,
+                 size_t numTables,
+                 size_t numProj,
+                 size_t numProbes,
+                 double hashWidth);
+
+   /**
+    * This function calculates the selectivity of LSH for a given set of parameters.
+    * It uses the function
+    *
+    * s = \int_{0}^{\infty}(Rho(\sqrt{x}) * f(x)) dx
+    *
+    * as proposed in the paper.
+    *
+    */
+   double Selectivity(size_t numTables,
+                      size_t numProj,
+                      size_t numProbes,
+                      double hashWidth);
+
+   /**
+    * Functor evaluating the integrand used by Recall() and Selectivity().
+    */
+   class IntegralObjective
+   {
+    public:
+     // Initialize everything.
+     IntegralObjective(const size_t k, 
+                       const size_t numTables,
+                       const size_t numProj,
+                       const size_t numProbes,
+                       const double hashWidth,
+                       const mlpack::distribution::GammaDistribution* gamma,
+                       const LSHModel* model)
+     : k(k), numTables(numTables), numProj(numProj), 
+     numProbes(numProbes), hashWidth(hashWidth), gamma(gamma), model(model)
+     { /* do nothing */};
+
+     ~IntegralObjective() { };
+
+     // Use as function with the operator () and one argument.
+     double operator()(const double& chi) const
+     {
+       return 
+         (model->Rho(std::sqrt(chi), hashWidth, numTables, numProj, numProbes)) 
+         * (gamma->Probability(chi, k));
+     }
+
+    private:  
+     const size_t k;
+     const size_t numTables;
+     const size_t numProj;
+     const size_t numProbes;
+     const double hashWidth;
+
+     const mlpack::distribution::GammaDistribution* gamma;
+     const LSHModel* model;
+
+   };
 
    /**
     * Function that fits two DistanceStatisticPredictors - one
@@ -271,13 +340,13 @@ class LSHModel
     * @param Gk The geometric mean of the squared distances of a point and its
     *      k-nearest neighbor. One column per k.
     */
-   void ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes, 
+   void ApproximateKNNStatistics(const arma::Col<size_t>& referenceSizes,
                                  const arma::Col<size_t>& kValues,
-                                 const arma::mat& Ek, 
+                                 const arma::mat& Ek,
                                  const arma::mat& Gk);
 
 
-   /** 
+   /**
     * Matrix that stores, in each column, the "direction" of the perturbation:
     * 0 means no perturbation on that dimension, -1 means reduce dimension value
     * by 1, and +1 means increase dimension value by 1.
diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index 6ea549c9700..400d46538e4 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -212,10 +212,75 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   // Step 8. Use formulas (19) and (20) from the paper to predict recall and
   // selectivity, using LSHModel::Rho() and the distribution functions of the
   // gammas we fit back in Step 6.
-  predictedRecall = 0.5;
-  predictedSelect = 0.5;
+  predictedRecall = Recall(k, numTables, numProj, numProbes, hashWidth);
+  predictedSelect = Selectivity(numTables, numProj, numProbes, hashWidth);
 }
 
+// Uses paper's formula (19) to predict recall.
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::Recall(size_t maxK,
+                                                       size_t numTables,
+                                                       size_t numProj,
+                                                       size_t numProbes,
+                                                       double hashWidth)
+{
+  double recall = 0;
+
+  // Loop over k values, accumulating the probabilities. Then take average.
+  // k starts from one because distancesDistribution(0) is the "simple" pairwise
+  // distances distribution.
+  for (size_t k = 1; k < maxK + 1; k++)
+  {
+    // Create a helper object for this value of k.
+    IntegralObjective f(k, numTables, numProj, numProbes, hashWidth,
+       &distancesDistribution, this);
+
+    // TODO: change with boost integration.
+    double from = 0;
+    double to = 1000;
+    double step = 0.01;
+    double integralSum = 0;
+    for (double i = from+step; i < to; i+=step)
+    {
+      double temp = f(i);
+      if (temp > 0)
+        integralSum += temp; // Use as function thanks to operator().
+      else
+        break; // Gamma distribution == 0 means we're past the tail.
+    }
+    recall += integralSum * step ;
+  }
+  return recall / double(maxK);
+}
+
+// Uses paper's formula (20) to compute selectivity
+template <typename SortPolicy, typename ObjectiveFunction>
+double LSHModel<SortPolicy, ObjectiveFunction>::Selectivity(size_t numTables,
+                                                            size_t numProj,
+                                                            size_t numProbes,
+                                                            double hashWidth)
+{
+
+  // Create a helper object for k = 0 (pairwise distances).
+  IntegralObjective f(0, numTables, numProj, numProbes, hashWidth,
+      &distancesDistribution, this);
+
+  // TODO: change with boost integration.
+  double from = 0;
+  double to = 1000;
+  double step = 0.01;
+  double integralSum = 0;
+  for (double i = from+step; i < to; i+=step)
+  {
+    double temp = f(i);
+    if (temp > 0)
+      integralSum += temp; // Use as function thanks to operator().
+    else
+      break; // Gamma distribution == 0 means we're past the tail.
+  }
+
+  return integralSum * step ;
+}
 
 /* NOTE: My interpretation of the paper would result in this code, but LSHKIT's
  * implementation is different. I'm commenting this out to try their way, and I
@@ -294,7 +359,7 @@ double LSHModel<SortPolicy, ObjectiveFunction>::Rho(double chi,
                                                     double hashWidth,
                                                     size_t numTables,
                                                     size_t numProj,
-                                                    size_t numProbes)
+                                                    size_t numProbes) const
 {
   double rho = 0;
 
@@ -316,7 +381,7 @@ double LSHModel<SortPolicy, ObjectiveFunction>::Rho(double chi,
 template <typename SortPolicy, typename ObjectiveFunction>
 double LSHModel<SortPolicy, ObjectiveFunction>::
 SameBucketProbability(double chi, double hashWidth, short delta, size_t proj,
-                      size_t numProj)
+                      size_t numProj) const
 {
   boost::math::normal_distribution<> phi;
   if (delta == 0)

From dd2bdf70e4047b0da4a4e178fb11c3300630077a Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 23 Aug 2016 16:53:50 +0300
Subject: [PATCH 12/18] Adds command line interface options to lshmodel

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 72 +-----------------------
 src/mlpack/methods/lsh/lshmodel_main.cpp | 65 +++++++++++++++------
 2 files changed, 50 insertions(+), 87 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index 400d46538e4..28cb6741564 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -238,7 +238,7 @@ double LSHModel<SortPolicy, ObjectiveFunction>::Recall(size_t maxK,
     // TODO: change with boost integration.
     double from = 0;
     double to = 1000;
-    double step = 0.01;
+    double step = 0.001;
     double integralSum = 0;
     for (double i = from+step; i < to; i+=step)
     {
@@ -268,7 +268,7 @@ double LSHModel<SortPolicy, ObjectiveFunction>::Selectivity(size_t numTables,
   // TODO: change with boost integration.
   double from = 0;
   double to = 1000;
-  double step = 0.01;
+  double step = 0.001;
   double integralSum = 0;
   for (double i = from+step; i < to; i+=step)
   {
@@ -282,74 +282,6 @@ double LSHModel<SortPolicy, ObjectiveFunction>::Selectivity(size_t numTables,
   return integralSum * step ;
 }
 
-/* NOTE: My interpretation of the paper would result in this code, but LSHKIT's
- * implementation is different. I'm commenting this out to try their way, and I
- * might go back to this if I see both work the same.
-
-// Probability of two points being neighbors if they are at distance chi.
-template <typename SortPolicy, typename ObjectiveFunction>
-double LSHModel<SortPolicy, ObjectiveFunction>::Rho(double chi,
-                                                    double hashWidth,
-                                                    size_t numTables,
-                                                    size_t numProj,
-                                                    size_t numProbes)
-{
-  // Calculate the formula:
-  // 1 - {Prod{1 - Prod{same_bin_probability}}}^numTables, where:
-  // * same_bin_probability is calculated with the Value() function.
-  // * Prod{same_bin_probability} is stored in product.
-  // * Prod{1 - Prod{same_bin_probability}} is stored in rho.
-
-  double rho = 1;
-
-  // Row-major loop :(. TODO: Refactor to make column-major.
-  for (size_t proj = 0; proj < numProj; ++proj)
-  {
-    double product = 1;
-    for (size_t probe = 0; probe < numProbes; ++probe)
-    {
-      // Use perturbation value (proj, probe), i.e. \delta_{\mu, \tau}
-      product *= Value(chi, hashWidth, templateSequence(proj, probe), numProj);
-    }
-
-    rho *= (1 - product);
-  }
-
-  return 1 - std::pow(rho, numTables);
-}
-
-// Probability of two points being neighbors if they are at distance chi.
-template <typename SortPolicy, typename ObjectiveFunction>
-double LSHModel<SortPolicy, ObjectiveFunction>::SameBucketProbability(double chi,
-                      double hashWidth,
-                      short delta,
-                      size_t proj,
-                      size_t numProj)
-{
-  if (delta == 0)
-  {
-    // No perturbation - probability of two queries sharing the same bin.
-    // Use the "default" normal distribution with mean = 0, sd = 1.
-    boost::math::normal_distribution phi;
-    return 2 * phi.pdf(hashWidth / chi) - 1
-      + std::sqrt(2 / M_PI) 
-      * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi);
-  }
-  else
-  {
-    // +1/-1 perturbation - probability of two queries being in adjacent bins.
-    double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0));
-    
-    // Negative perturbation - flip deltaI.
-    if (delta == -1)
-      deltaI = 1 - deltaI;
-
-    boost::math::normal_distribution phi(-delta, chi);
-    return phi.cdf(hashWidth) - phi.cdf(0);
-  }
-}
-*/
-
 /*
  * Based on the LSHKIT implementation, not my understanding of the paper.
  */
diff --git a/src/mlpack/methods/lsh/lshmodel_main.cpp b/src/mlpack/methods/lsh/lshmodel_main.cpp
index 20b9f0ee4f6..88e79410bfa 100644
--- a/src/mlpack/methods/lsh/lshmodel_main.cpp
+++ b/src/mlpack/methods/lsh/lshmodel_main.cpp
@@ -3,36 +3,67 @@
 
 #include "lshmodel.hpp"
 
+using std::string; using std::endl; using std::cout;
 using namespace mlpack;
 using namespace mlpack::neighbor;
 
-PROGRAM_INFO("LSH Model (TODO: Complete this)", "");
+PROGRAM_INFO("LSH Modeling and Tuning", 
+    "This program can help tune parameters for the LSH algorithm for"
+    " approximate nearest neighbor search. Currently, the only option is to"
+    " specify a number of the four parameters (numTables, numProj, numProbes,"
+    " hashWidth) and receive an estimate of LSH's recall and selectivity for a"
+    " given dataset."
+    );
 
 PARAM_STRING_IN("reference_file", "File containing the dataset", "r", "");
-PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m");
+PARAM_DOUBLE_IN("sample_percentage", "Sample size percentage. Must be in (0, 1]", "p", 0.0);
+// Parameters of the LSH configuration whose performance will be estimated.
+PARAM_INT_IN("neighbors", "The number of nearest neighbors LSH will search for", "k", 1);
+PARAM_INT_IN("tables", "The number of tables for LSH", "L", 30);
+PARAM_INT_IN("projections", "The number of projections per table for LSH", "K", 10);
+PARAM_INT_IN("probes", "The number of probes for multiprobe LSH", "T", 0);
+PARAM_DOUBLE_IN("hash_width", "The hash width for the first level hashing", "H", 1.0);
+
+//PARAM_STRING_OUT("output_model_file", "File to save trained LSH model to", "m");
 
 int main(int argc, char* argv[])
 {
   CLI::ParseCommandLine(argc, argv);
 
-  // Generate a random point set.
-  size_t N = 5000;
-  size_t d = 10;
-  size_t k = 5;
-  double sampleSize = 0.25;
+  // If no input file was specified, die here.
+  if (!CLI::HasParam("reference_file"))
+    Log::Fatal << "You need to specify the reference file." << endl;
+  // Read the input file name.
+  string rfile = CLI::GetParam<string>("reference_file");
+  // Attempt to read the file into a matrix.
+  arma::mat rdata;
+  data::Load(rfile, rdata, true); // true: if you can't open file, die.
+  size_t N = rdata.n_cols; // Dataset size.
+
+  // Parse rest of command line input.
+  size_t k = CLI::GetParam<int>("neighbors");
+  size_t numTables = CLI::GetParam<int>("tables");
+  size_t numProj = CLI::GetParam<int>("projections");
+  size_t numProbes = CLI::GetParam<int>("probes");
+  double hashWidth = CLI::GetParam<double>("hash_width");
+  double sampleSize = CLI::GetParam<double>("sample_percentage");
+  if (!(sampleSize > 0.0 && sampleSize <= 1.0)) // Also rejects the 0.0 default.
+    Log::Fatal << "The sampling percentage must be in (0, 1]." << endl;
+
+  Log::Info <<
+    "Tuning LSH for" << std::endl
+    <<"\t numTables = " << numTables << std::endl
+    <<"\t numProj = " << numProj << std::endl
+    <<"\t numProbes = " << numProbes << std::endl
+    <<"\t hashWidth = " << hashWidth << std::endl;
+
   double recall, selectivity;
-  arma::mat rdata(d, N, arma::fill::randu);
-  LSHModel<> model(rdata, sampleSize, k);
-  model.Predict(N, k, 16, 4, 4, 1.0, recall, selectivity);
 
-  Log::Info << "Model predicts " << recall*100 << "\% recall and "
-    << selectivity*100 << "\% selectivity." << std::endl;
+  LSHModel<> model(rdata, sampleSize, k);
+  model.Predict(N, k, numTables, numProj, numProbes, hashWidth, recall, selectivity);
 
-  arma::mat qdata(d, 1, arma::fill::randu);
-  arma::Mat<size_t> neighbors;
-  arma::mat distances;
-  LSHSearch<> lsh = model.LSHObject(1, 1, 1.0, 99901, 500);
-  lsh.Search(qdata, 1, neighbors, distances);
+  cout << "Model predicts " << recall*100 << "% recall and "
+    << selectivity*100 << "% selectivity." << endl;
 
   return 0;
 }

From 57c9d5e634d7d3d7e2ca1618353fe37d9e23b34a Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 23 Aug 2016 17:49:44 +0300
Subject: [PATCH 13/18] Changes way kNN samples are generated

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 44 ++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index 28cb6741564..dbdcaebe943 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -104,36 +104,43 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
   // through exponentiating the mean of the logarithms of x:
   // exp(mean(log(x))) = geometricmean(x).
 
-  // Number of samples to create for modeling the Gamma Distributions
-  size_t regressionExamples = 50; // TODO: parameter?
-
-  // Number of points to use as queries.
-  size_t numAnchors = (size_t) std::round(0.1 * numSamples);
-  arma::mat queryMat = sampleSet.cols(0, numAnchors - 1);
-  // Evenly spaced sample sizes.
-  arma::Col<size_t> referenceSizes = arma::conv_to< arma::Col<size_t> >::from(
-    arma::linspace(numAnchors, numSamples - numAnchors - 1,
-      regressionExamples));
+  // Number of points to use as queries. Use 10% of sample.
+  double anchorsSample = 0.1;
+  size_t numAnchors = (size_t) std::round(anchorsSample * numSamples);
 
   // Statistics - Arithmetic and geometric means for growing reference set.
   // Compute one of each for each k.
+  size_t regressionExamples = size_t(
+      std::round((1.0 - anchorsSample) / anchorsSample));
   arma::mat Ek(regressionExamples, k);
   arma::mat Gk(regressionExamples, k);
 
-  // For each referenceSize, calculate the kNN of the anchors
+  // For each reference size, calculate the kNN of the anchors. Divide reference
+  // set into equal blocks (block 1 is anchors). In repetition 1, use block 2 as
+  // reference set, in repetition 2, blocks 2 and 3, and so on.
+  size_t refSetStart = numAnchors;
+  size_t refSetEnd = numAnchors;
+  arma::mat queryMat = sampleSet.cols(0, numAnchors - 1);
+  arma::Col<size_t> referenceSizes(regressionExamples);
+
   Log::Info.ignoreInput = true; // Ignore kNN output.
+  // TODO: Since we've already computed this, avoid calling kNN?
   for (size_t i = 0; i < regressionExamples; ++i)
   {
-    // TODO: Since we've already computed this, avoid calling kNN?
+    refSetEnd += refSetStart - 1;
+
+    cout << "Neighbors "<< refSetStart <<":"<<refSetEnd<<endl;
 
     // Reference set for kNN
-    arma::mat refMat = sampleSet.cols(numAnchors,
-        numAnchors + referenceSizes(i));
+    arma::mat refMat = sampleSet.cols(refSetStart, refSetEnd);
+    referenceSizes(i) = refMat.n_cols;
 
     arma::Mat<size_t> neighbors; // Not going to be used but required.
     arma::mat kNNDistances; // What we need.
     KNN naive(refMat, true); // true: train and use naive kNN.
     naive.Search(queryMat, k, neighbors, kNNDistances);
+
+    // Store the squared distances (what we need).
     kNNDistances = arma::pow(kNNDistances, 2);
 
     // Compute Arithmetic and Geometric mean of the distances.
@@ -206,14 +213,15 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   Timer::Stop("fitting_distributions");
 
   // Step 7. Generate the Template Probing Sequence using the maximum number of
-  // projections and the maximum number of probes.
-  GenerateTemplateSequence(numProj, numProbes);
+  // projections and the maximum number of probes. +1 because 0 additional
+  // probes means 1 probe total.
+  GenerateTemplateSequence(numProj, numProbes + 1);
 
   // Step 8. Use formulas (19) and (20) from the paper to predict recall and
   // selectivity, using LSHModel::Rho() and the distribution functions of the
   // gammas we fit back in Step 6.
-  predictedRecall = Recall(k, numTables, numProj, numProbes, hashWidth);
-  predictedSelect = Selectivity(numTables, numProj, numProbes, hashWidth);
+  predictedRecall = Recall(k, numTables, numProj, numProbes + 1, hashWidth);
+  predictedSelect = Selectivity(numTables, numProj, numProbes + 1, hashWidth);
 }
 
 // Uses paper's formula (19) to predict recall.

From a0626a8c41fedc60ec255ef2939a519dfac5b83a Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Wed, 24 Aug 2016 19:14:53 +0300
Subject: [PATCH 14/18] Prevents log(0) which causes errors. Infinite loop
 still happens occasionally

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index dbdcaebe943..c4c2f3a4ea5 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -129,8 +129,6 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
   {
     refSetEnd += refSetStart - 1;
 
-    cout << "Neighbors "<< refSetStart <<":"<<refSetEnd<<endl;
-
     // Reference set for kNN
     arma::mat refMat = sampleSet.cols(refSetStart, refSetEnd);
     referenceSizes(i) = refMat.n_cols;
@@ -140,6 +138,9 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
     KNN naive(refMat, true); // true: train and use naive kNN.
     naive.Search(queryMat, k, neighbors, kNNDistances);
 
+    // If identical points are found, disregard their distance to avoid log(0).
+    kNNDistances = kNNDistances.cols(arma::find(kNNDistances > 0));
+
     // Store the squared distances (what we need).
     kNNDistances = arma::pow(kNNDistances, 2);
 
@@ -147,6 +148,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
     Ek.row(i) = arma::mean(kNNDistances.t());
     Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0));
   }
+  cout << Ek << endl;
+  cout << Gk << endl;
   Log::Info.ignoreInput = false; // Keep giving normal output.
 
   // Step 5. Model the arithmetic and geometric mean according to the paper.

From cc1b6910d9ed28b5ab8d1911b9af4642ca8475e7 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 30 Aug 2016 10:09:48 +0300
Subject: [PATCH 15/18] Removes parameterized objective function constructor

---
 src/mlpack/methods/lsh/objectivefunction.hpp | 22 ++++----------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/src/mlpack/methods/lsh/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp
index 2aa60ab8054..6ddc51c8177 100644
--- a/src/mlpack/methods/lsh/objectivefunction.hpp
+++ b/src/mlpack/methods/lsh/objectivefunction.hpp
@@ -28,20 +28,6 @@ class DefaultObjectiveFunction
     //! Default constructor - do nothing.
     DefaultObjectiveFunction() { };
 
-    /**
-     * Parameterized constructor.
-     *
-     * @param xData Vector of x - the sizes of the reference set when performing
-     *    kNN.
-     * @param kData Vector of k - the kth nearest neighbor for which we
-     *    calculated the statistic.
-     * @param yData Matrix of y, one for each (x, k) value.
-     */
-    DefaultObjectiveFunction(const arma::Col<size_t>& xData, 
-                             const arma::Col<size_t>& kData, 
-                             const arma::mat& yData)
-      : xData(&xData), kData(&kData), yData(&yData)  { };
-
     //! Return the number of functions
     size_t NumFunctions(void) const { return yData->n_elem; }
 
@@ -86,10 +72,10 @@ class DefaultObjectiveFunction
 double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const
 {
   // Use extra variables to make code readable.
-  double alpha = coordinates(0, 0);
-  double beta = coordinates(1, 0);
-  double gamma = coordinates(2, 0);
-  double M = (double) NumFunctions();
+  const double alpha = coordinates(0, 0);
+  const double beta = coordinates(1, 0);
+  const double gamma = coordinates(2, 0);
+  const double M = (double) NumFunctions();
 
   // Sum the squared error for each element in yData.
   double sum = 0;

From fb308e80d00990f7ee697cbb26e74af045aa4bf5 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Tue, 30 Aug 2016 13:59:34 +0300
Subject: [PATCH 16/18] Solves NaN values issue

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp     | 39 +++++++++++++++-----
 src/mlpack/methods/lsh/objectivefunction.hpp | 22 +++++++++--
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index c4c2f3a4ea5..1a66377e03f 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -84,6 +84,21 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
     for (size_t j = i + 1; j < numSamples; ++j)
       distances(d++) = metric::EuclideanDistance::Evaluate(
           sampleSet.unsafe_col(i), sampleSet.unsafe_col(j));
+  
+  // We need to take the logarithm of these distances, so replace the 0s with
+  // very small values.
+  // Find smallest value
+  double smallest = DBL_MAX;
+  for (size_t i = 0; i < d; ++i)
+    if (distances(i) < smallest && distances(i) > 0)
+      smallest = distances(i);
+
+  // Replace 0s with fraction of smallest value.
+  for (size_t i = 0; i < d; ++i)
+    if (distances(i) == 0)
+      distances(i) = 1e-5 * smallest;
+
+
   Log::Info << "Computed " << d << " pointwise distances." << std::endl;
   Timer::Stop("pairwise_distances");
 
@@ -112,6 +127,8 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
   // Compute one of each for each k.
   size_t regressionExamples = size_t(
       std::round((1.0 - anchorsSample) / anchorsSample));
+
+  // store statistics for the distances.
   arma::mat Ek(regressionExamples, k);
   arma::mat Gk(regressionExamples, k);
 
@@ -138,18 +155,19 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
     KNN naive(refMat, true); // true: train and use naive kNN.
     naive.Search(queryMat, k, neighbors, kNNDistances);
 
-    // If identical points are found, disregard their distance to avoid log(0).
-    kNNDistances = kNNDistances.cols(arma::find(kNNDistances > 0));
+    // Replace 0s again.
+    for (size_t c = 0; c < kNNDistances.n_cols; ++c)
+      for (size_t r = 0; r < kNNDistances.n_rows; ++r)
+        if (kNNDistances(r, c) == 0)
+          kNNDistances(r, c) = 1e-5 * smallest;
 
     // Store the squared distances (what we need).
     kNNDistances = arma::pow(kNNDistances, 2);
 
     // Compute Arithmetic and Geometric mean of the distances.
-    Ek.row(i) = arma::mean(kNNDistances.t());
-    Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances.t()), 0));
+    Ek.row(i) = arma::mean(kNNDistances, 1).t();
+    Gk.row(i) = arma::exp(arma::mean(arma::log(kNNDistances), 1)).t();
   }
-  cout << Ek << endl;
-  cout << Gk << endl;
   Log::Info.ignoreInput = false; // Keep giving normal output.
 
   // Step 5. Model the arithmetic and geometric mean according to the paper.
@@ -199,8 +217,9 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
   meanVec(0) = this->meanDist;
   logMeanVec(0) = this->logMeanDist;
   meanLogVec(0) = this->meanLogDist;
-  // Train gamma and put in gammaDists[0].
 
+  // Use the trained predictors (Step 5) to predict arithmetic and geometric
+  // means for each k value.
   Timer::Start("fitting_distributions");
   for (size_t i = 1; i <= k; ++i)
   {
@@ -211,13 +230,13 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Predict(const size_t datasetSize,
     // log(geometricMean) = \frac{1}{n} \sum(lnx_i) = mean(lnx) = meanLog
     meanLogVec(i) = std::log(gMeanPredictor.Predict(datasetSize, k));
   }
-  // Fit the distribution.
+  // Fit the distribution using the estimated and computed statistics.
   distancesDistribution.Train(logMeanVec, meanLogVec, meanVec);
   Timer::Stop("fitting_distributions");
 
   // Step 7. Generate the Template Probing Sequence using the maximum number of
-  // projections and the maximum number of probes. +1 because 0 additional
-  // probes means 1 probe total.
+  // projections and the maximum number of probes.
+  // +1 because 0 additional probes means 1 probe total.
   GenerateTemplateSequence(numProj, numProbes + 1);
 
   // Step 8. Use formulas (19) and (20) from the paper to predict recall and
diff --git a/src/mlpack/methods/lsh/objectivefunction.hpp b/src/mlpack/methods/lsh/objectivefunction.hpp
index 6ddc51c8177..2aa60ab8054 100644
--- a/src/mlpack/methods/lsh/objectivefunction.hpp
+++ b/src/mlpack/methods/lsh/objectivefunction.hpp
@@ -28,6 +28,20 @@ class DefaultObjectiveFunction
     //! Default constructor - do nothing.
     DefaultObjectiveFunction() { };
 
+    /**
+     * Parameterized constructor.  Only the addresses of the arguments are
+     * stored, so all three objects must outlive this objective function.
+     * @param xData Vector of x - the sizes of the reference set when performing
+     *    kNN.
+     * @param kData Vector of k - the kth nearest neighbor for which we
+     *    calculated the statistic.
+     * @param yData Matrix of y, one for each (x, k) value.
+     */
+    DefaultObjectiveFunction(const arma::Col<size_t>& xData, 
+                             const arma::Col<size_t>& kData, 
+                             const arma::mat& yData)
+      : xData(&xData), kData(&kData), yData(&yData)  { };
+
     //! Return the number of functions
     size_t NumFunctions(void) const { return yData->n_elem; }
 
@@ -72,10 +86,10 @@ class DefaultObjectiveFunction
 double DefaultObjectiveFunction::Evaluate(const arma::mat& coordinates) const
 {
   // Use extra variables to make code readable.
-  const double alpha = coordinates(0, 0);
-  const double beta = coordinates(1, 0);
-  const double gamma = coordinates(2, 0);
-  const double M = (double) NumFunctions();
+  double alpha = coordinates(0, 0);
+  double beta = coordinates(1, 0);
+  double gamma = coordinates(2, 0);
+  double M = (double) NumFunctions();
 
   // Sum the squared error for each element in yData.
   double sum = 0;

From f47e06938ce4b77907177aa5b3afc780a5a05b7f Mon Sep 17 00:00:00 2001
From: Ryan Curtin <ryan@ratml.org>
Date: Wed, 9 Nov 2016 13:54:03 -0500
Subject: [PATCH 17/18] (hopefully) Fix SameBucketProbability() and add some
 tests for it.

I had to mark some private methods public to test these correctly, so maybe we
should consider doing a little refactoring or redesign there, but I have not
thought much about how.
---
 src/mlpack/methods/lsh/lshmodel.hpp      |  3 +
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 12 ++--
 src/mlpack/tests/lsh_test.cpp            | 79 ++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/src/mlpack/methods/lsh/lshmodel.hpp b/src/mlpack/methods/lsh/lshmodel.hpp
index 5c250404bb7..06b0340e061 100644
--- a/src/mlpack/methods/lsh/lshmodel.hpp
+++ b/src/mlpack/methods/lsh/lshmodel.hpp
@@ -219,6 +219,7 @@ class LSHModel
     *     want to compute the template perturbation sequence.
     * @param numProbes The number of probes to generate.
     */
+ public:
    void GenerateTemplateSequence(size_t numProj,
                                  size_t numProbes);
 
@@ -241,6 +242,7 @@ class LSHModel
               size_t numTables,
               size_t numProj,
               size_t numProbes) const;
+
    /**
     * This is a helper function that is called by Rho() and returns the inner
     * value of the product used in the calculation of the probability that Rho
@@ -257,6 +259,7 @@ class LSHModel
                                        short delta,
                                        size_t proj,
                                        size_t numProj) const;
+ private:
 
    /**
     * This function calculates the recall of LSH for a given set of parameters.
diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index 1a66377e03f..cb8e212ed0d 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -349,15 +349,19 @@ SameBucketProbability(double chi, double hashWidth, short delta, size_t proj,
   if (delta == 0)
   {
     // No perturbation - probability of two queries sharing the same bin.
-    return 2 * pdf(phi, hashWidth / chi) - 1
-      + std::sqrt(2 / M_PI) 
-      * (std::exp(-pow((hashWidth / chi), 2) / 2.0 - 1.0)) / (hashWidth / chi);
+    // The derivation to come to this solution is... pretty intense.  If you
+    // want to reproduce it, take equation (13) from the paper and expand it.
+    // Integrate the Gaussian PDF phi((z - x) / d), then take the expected value
+    // over z assuming a uniform distribution (so f(z) = 1/W).  After some
+    // integration and algebraic simplification, you should come to the result
+    // below.
+    return 2 * cdf(phi, hashWidth / chi) - 1.0;
   }
   else
   {
     // +1/-1 perturbation - probability of two queries being in adjacent bins.
     double deltaI = (proj + 1.0) / (2.0 * (numProj + 2.0));
-    
+
     // Negative perturbation - flip deltaI.
     if (delta == -1)
       deltaI = 1 - deltaI;
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index 33485fce76d..64110d92e66 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -10,6 +10,7 @@
 
 #include <mlpack/methods/lsh/lsh_search.hpp>
 #include <mlpack/methods/neighbor_search/neighbor_search.hpp>
+#include <mlpack/methods/lsh/lshmodel.hpp>
 
 using namespace std;
 using namespace mlpack;
@@ -831,4 +832,82 @@ BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
 }
 #endif
 
+// Test that LSHModel::Rho() returns reasonable results (for hash width 0.1).
+BOOST_AUTO_TEST_CASE(RhoTest)
+{
+  arma::mat data(10, 1000, arma::fill::randu);
+  LSHModel<> m(data, 0.1, 3);
+  m.GenerateTemplateSequence(5, 5);
+
+  // Two identical points should have high probability of being in the same bin.
+  for (double hw = 0.1; hw < 0.5; hw++) // Runs once: hw++ adds 1.0.
+    BOOST_REQUIRE_CLOSE(m.Rho(0.0, hw, 5, 5, 5), 1.0, 1e-5);
+
+  // Two very faraway points should have very small probability of being in the
+  // same bin.  NOTE(review): was `hw += 0.1` intended in both loops?
+  for (double hw = 0.1; hw <= 0.5; hw++) // Runs once: hw++ adds 1.0.
+    BOOST_REQUIRE_SMALL(m.Rho(5.0, hw, 5, 5, 5), 1e-5);
+}
+
+// Test that LSHModel::SameBucketProbability() returns reasonable results when
+// delta = 0.  Arguments are (chi, hashWidth, delta, proj, numProj).
+BOOST_AUTO_TEST_CASE(SameBucketProbabilityDelta0Test)
+{
+  // Create a simple LSHModel from a 10 x 100 random matrix.
+  arma::mat data(10, 100, arma::fill::randu);
+  LSHModel<> m(data, 0.1, 1);
+
+  // When the points are far (chi = 1e6) and the hash width is small (1e-2) the
+  // probability should be very close to 0.
+  BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, 0, 5, 10), 1e-5);
+
+  // When the points are close (chi = 1e-2) and the hash width is large (1e6)
+  // the probability should be very close to 1.
+  BOOST_REQUIRE_CLOSE(m.SameBucketProbability(1e-2, 1e6, 0, 0, 3), 1.0, 1e-5);
+
+  // For random distances r (hash width 0.5), the result must lie in [0, 1].
+  for (size_t i = 0; i < 1000; ++i)
+  {
+    const double r = math::Random();
+
+    const double p = m.SameBucketProbability(r, 0.5, 0, 0, 3);
+    BOOST_REQUIRE_GE(p, 0.0);
+    BOOST_REQUIRE_LE(p, 1.0);
+  }
+}
+
+// Test that LSHModel::SameBucketProbability() returns reasonable results when
+// delta = 1 or delta = -1.  Arguments are (chi, hashWidth, delta, proj, numProj).
+BOOST_AUTO_TEST_CASE(SameBucketProbabilityDelta1Test)
+{
+  // Create a simple LSHModel from a 10 x 100 random matrix.
+  arma::mat data(10, 100, arma::fill::randu);
+  LSHModel<> m(data, 0.1, 1);
+
+  // When the points are very far and the hash width is small the probability
+  // should be very close to 0, regardless of delta.
+  BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, 1, 5, 10), 1e-5);
+  BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e6, 1e-2, -1, 5, 10), 1e-5);
+
+  // When the points are close(ish) and the hash width is large the probability
+  // should still be close to 0 because delta != 0 means that we are searching
+  // adjacent bins.
+  BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e-2, 1e2, 1, 0, 3), 1e-5);
+  BOOST_REQUIRE_SMALL(m.SameBucketProbability(1e-2, 1e2, -1, 0, 3), 1e-5);
+
+  // For random distances r, both delta = 1 and delta = -1 must give [0, 1].
+  for (size_t i = 0; i < 1000; ++i)
+  {
+    const double r = math::Random();
+
+    const double p = m.SameBucketProbability(r, 0.5, 1, 0, 3);
+    BOOST_REQUIRE_GE(p, 0.0);
+    BOOST_REQUIRE_LE(p, 1.0);
+
+    const double p2 = m.SameBucketProbability(r, 0.5, -1, 0, 3);
+    BOOST_REQUIRE_GE(p2, 0.0);
+    BOOST_REQUIRE_LE(p2, 1.0);
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END();

From f05b0e034fd8210a9dd6a94d6874e7fab2c4b145 Mon Sep 17 00:00:00 2001
From: mentekid <mentekid@gmail.com>
Date: Sun, 20 Nov 2016 10:31:30 +0000
Subject: [PATCH 18/18] Switches to NAIVE_MODE for kNN

---
 src/mlpack/methods/lsh/lshmodel_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mlpack/methods/lsh/lshmodel_impl.hpp b/src/mlpack/methods/lsh/lshmodel_impl.hpp
index cb8e212ed0d..3c4a6680ca9 100644
--- a/src/mlpack/methods/lsh/lshmodel_impl.hpp
+++ b/src/mlpack/methods/lsh/lshmodel_impl.hpp
@@ -152,7 +152,7 @@ void LSHModel<SortPolicy, ObjectiveFunction>::Train(
 
     arma::Mat<size_t> neighbors; // Not going to be used but required.
     arma::mat kNNDistances; // What we need.
-    KNN naive(refMat, true); // true: train and use naive kNN.
+    KNN naive(refMat, NAIVE_MODE); // NAIVE_MODE: train and use naive kNN.
     naive.Search(queryMat, k, neighbors, kNNDistances);
 
     // Replace 0s again.