Add k-means++ initialization. #2813

Merged · 8 commits · Jan 26, 2021
Changes from 7 commits
2 changes: 2 additions & 0 deletions HISTORY.md
@@ -13,6 +13,8 @@

* Add finalizers to Julia binding model types to fix memory handling (#2756).

* Add k-means++ initialization strategy (#2813).

### mlpack 3.4.2
###### 2020-10-26
* Added Mean Absolute Percentage Error.
5 changes: 4 additions & 1 deletion src/mlpack/core/util/param_checks.hpp
@@ -43,11 +43,14 @@ namespace util {
* @param fatal If true, output goes to Log::Fatal instead of Log::Warn and an
* exception is thrown.
* @param customErrorMessage Error message to append.
* @param allowNone If true, no error will be issued if none of the
*      parameters in the constraints set was passed.
*/
void RequireOnlyOnePassed(
const std::vector<std::string>& constraints,
const bool fatal = true,
const std::string& customErrorMessage = "");
const std::string& customErrorMessage = "",
const bool allowNone = false);

/**
* Require that at least one of the given parameters in the constraints set was
5 changes: 3 additions & 2 deletions src/mlpack/core/util/param_checks_impl.hpp
@@ -21,7 +21,8 @@ namespace util {
inline void RequireOnlyOnePassed(
const std::vector<std::string>& constraints,
const bool fatal,
const std::string& errorMessage)
const std::string& errorMessage,
const bool allowNone)
{
if (BINDING_IGNORE_CHECK(constraints))
return;
@@ -57,7 +58,7 @@ inline void RequireOnlyOnePassed(
stream << "; " << errorMessage;
stream << "!" << std::endl;
}
else if (set == 0)
else if (set == 0 && !allowNone)
{
stream << (fatal ? "Must " : "Should ");

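A minimal sketch of the new signature in use (mirroring the call that kmeans_main.cpp makes later in this PR; not part of this hunk):

// With allowNone == true, passing neither "refined_start" nor
// "kmeans_plus_plus" is acceptable; passing both is still a fatal error.
mlpack::util::RequireOnlyOnePassed(
    { "refined_start", "kmeans_plus_plus" },
    true,   // fatal: a violation throws rather than warns.
    "Only one initialization strategy can be specified!",
    true);  // allowNone: zero passed parameters is fine.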
1 change: 1 addition & 0 deletions src/mlpack/methods/kmeans/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SOURCES
kill_empty_clusters.hpp
kmeans.hpp
kmeans_impl.hpp
kmeans_plus_plus_initialization.hpp
max_variance_new_cluster.hpp
max_variance_new_cluster_impl.hpp
naive_kmeans.hpp
33 changes: 24 additions & 9 deletions src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -17,6 +17,7 @@
#include "allow_empty_clusters.hpp"
#include "kill_empty_clusters.hpp"
#include "refined_start.hpp"
#include "kmeans_plus_plus_initialization.hpp"
#include "elkan_kmeans.hpp"
#include "hamerly_kmeans.hpp"
#include "pelleg_moore_kmeans.hpp"
@@ -44,14 +45,17 @@ BINDING_LONG_DESC(
" the point furthest from the centroid of the cluster with maximum variance"
" is taken to fill that cluster."
"\n\n"
"Optionally, the Bradley and Fayyad approach (\"Refining initial points for"
" k-means clustering\", 1998) can be used to select initial points by "
"specifying the " + PRINT_PARAM_STRING("refined_start") + " parameter. "
"This approach works by taking random samplings of the dataset; to specify "
"the number of samplings, the " + PRINT_PARAM_STRING("samplings") +
" parameter is used, and to specify the percentage of the dataset to be "
"used in each sample, the " + PRINT_PARAM_STRING("percentage") +
" parameter is used (it should be a value between 0.0 and 1.0)."
"Optionally, the strategy to choose initial centroids can be specified. "
"The k-means++ algorithm can be used to choose initial centroids with "
"the " + PRINT_PARAM_STRING("kmeans_plus_plus") + " parameter. The "
"Bradley and Fayyad approach (\"Refining initial points for k-means "
"clustering\", 1998) can be used to select initial points by specifying "
"the " + PRINT_PARAM_STRING("refined_start") + " parameter. This approach "
"works by taking random samplings of the dataset; to specify the number of "
"samplings, the " + PRINT_PARAM_STRING("samplings") + " parameter is used, "
"and to specify the percentage of the dataset to be used in each sample, "
"the " + PRINT_PARAM_STRING("percentage") + " parameter is used (it should "
"be a value between 0.0 and 1.0)."
"\n\n"
"There are several options available for the algorithm used for each Lloyd "
"iteration, specified with the " + PRINT_PARAM_STRING("algorithm") + " "
@@ -102,6 +106,7 @@ BINDING_EXAMPLE(
// See also...
BINDING_SEE_ALSO("K-Means tutorial", "@doxygen/kmtutorial.html");
BINDING_SEE_ALSO("@dbscan", "#dbscan");
BINDING_SEE_ALSO("k-means++", "https://en.wikipedia.org/wiki/K-means%2B%2B");
BINDING_SEE_ALSO("Using the triangle inequality to accelerate k-means (pdf)",
"http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf");
BINDING_SEE_ALSO("Making k-means even faster (pdf)",
@@ -147,6 +152,8 @@ PARAM_INT_IN("samplings", "Number of samplings to perform for refined start "
"(use when --refined_start is specified).", "S", 100);
PARAM_DOUBLE_IN("percentage", "Percentage of dataset to use for each refined "
"start sampling (use when --refined_start is specified).", "p", 0.02);
PARAM_FLAG("kmeans_plus_plus", "Use the k-means++ initialization strategy to "
"choose initial points.", "K");

PARAM_STRING_IN("algorithm", "Algorithm to use for the Lloyd iteration "
"('naive', 'pelleg-moore', 'elkan', 'hamerly', 'dualtree', or "
@@ -176,6 +183,9 @@ static void mlpackMain()
else
math::RandomSeed((size_t) std::time(NULL));

RequireOnlyOnePassed({ "refined_start", "kmeans_plus_plus" }, true,
"Only one initialization strategy can be specified!", true);

// Now, start building the KMeans type that we'll be using. Start with the
// initial partition policy. The call to FindEmptyClusterPolicy<> results in
// a call to RunKMeans<> and the algorithm is completed.
@@ -191,6 +201,11 @@

FindEmptyClusterPolicy<RefinedStart>(RefinedStart(samplings, percentage));
}
else if (IO::HasParam("kmeans_plus_plus"))
{
FindEmptyClusterPolicy<KMeansPlusPlusInitialization>(
KMeansPlusPlusInitialization());
}
else
{
FindEmptyClusterPolicy<SampleInitialization>(SampleInitialization());
@@ -271,7 +286,7 @@ void RunKMeans(const InitialPartitionPolicy& ipp)
const int maxIterations = IO::GetParam<int>("max_iterations");

// Make sure we have an output file if we're not doing the work in-place.
RequireAtLeastOnePassed({ "in_place", "output", "centroid" }, false,
RequireOnlyOnePassed({ "in_place", "output", "centroid" }, false,
"no results will be saved");

arma::mat dataset = IO::GetParam<arma::mat>("input"); // Load our dataset.
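Taken together, the new option would be invoked from the command-line binding roughly like this (a sketch assuming mlpack's usual convention that matrix parameters take a _file suffix on the command line; the flag names come from the PARAM definitions above):

mlpack_kmeans --input_file data.csv --clusters 5 --kmeans_plus_plus --output_file assignments.csv --centroid_file centroids.csv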
103 changes: 103 additions & 0 deletions src/mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp
@@ -0,0 +1,103 @@
/**
* @file methods/kmeans/kmeans_plus_plus_initialization.hpp
* @author Ryan Curtin
*
* This file implements the k-means++ initialization strategy.
*/
#ifndef MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP
#define MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP

#include <mlpack/prereqs.hpp>

/**
* This class implements the k-means++ initialization, as described in the
* following paper:
*
* @code
* @inproceedings{arthur2007k,
* title={k-means++: The advantages of careful seeding},
* author={Arthur, David and Vassilvitskii, Sergei},
* booktitle={Proceedings of the Eighteenth Annual ACM-SIAM Symposium on
* Discrete Algorithms (SODA '07)},
* pages={1027--1035},
* year={2007},
* organization={Society for Industrial and Applied Mathematics}
* }
* @endcode
*
* In accordance with mlpack's InitialPartitionPolicy template type, we only
* need to implement a constructor and a method to compute the initial
* centroids.
*/
class KMeansPlusPlusInitialization
{
public:
//! Empty constructor, required by the InitialPartitionPolicy type definition.
KMeansPlusPlusInitialization() { }

/**
* Initialize the centroids matrix using the k-means++ strategy: the first
* centroid is sampled uniformly at random from the data, and each later
* centroid is sampled with probability proportional to the squared distance
* between a point and its nearest already-chosen centroid.
*
* @param data Dataset.
* @param clusters Number of clusters.
* @param centroids Matrix to put initial centroids into.
*/
template<typename MatType>
inline static void Cluster(const MatType& data,
const size_t clusters,
arma::mat& centroids)
{
centroids.set_size(data.n_rows, clusters);

// We'll sample our first point fully randomly.
size_t firstPoint = mlpack::math::RandInt(0, data.n_cols);
centroids.col(0) = data.col(firstPoint);

// Utility variable.
arma::vec distribution(data.n_cols);

// Now, sample other points...
for (size_t i = 1; i < clusters; ++i)
{
// We must compute the CDF for sampling... this depends on the computation
// of the minimum distance between each point and its closest
// already-chosen centroid.
//
// This computation is ripe for speedup with trees! I am not sure exactly
// how much we would need to approximate, but I think it could be done
// without breaking the O(log k)-competitive guarantee (I think).
for (size_t p = 0; p < data.n_cols; ++p)
{
double minDistance = std::numeric_limits<double>::max();
for (size_t j = 0; j < i; ++j)
{
const double distance =
mlpack::metric::SquaredEuclideanDistance::Evaluate(data.col(p),
centroids.col(j));
minDistance = std::min(distance, minDistance);
}

distribution[p] = minDistance;
}

// Next, normalize the distribution so it sums to one (strictly speaking we
// could avoid this by sampling in [0, sum] instead).
distribution /= arma::accu(distribution);

// Turn it into a CDF for convenience...
for (size_t j = 1; j < distribution.n_elem; ++j)
distribution[j] += distribution[j - 1];

// Sample a point...
const double sampleValue = mlpack::math::Random();
const double* elem = std::lower_bound(distribution.begin(),
distribution.end(), sampleValue);
// Pointer subtraction already yields an element index, not a byte offset,
// so no division by sizeof(double) is needed.
const size_t position = (size_t) (elem - distribution.begin());
centroids.col(i) = data.col(position);
}
}
};

#endif
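For context: k-means++ draws the first centroid uniformly at random, then draws each subsequent centroid with probability proportional to D(x)^2, the squared distance from point x to its nearest already-chosen centroid — exactly the distribution the loop in Cluster() builds. A minimal, self-contained sketch of plugging the new policy into mlpack's KMeans class (assuming the KMeans<MetricType, InitialPartitionPolicy, ...> template parameters; an illustration, not part of the diff):

#include <mlpack/core.hpp>
#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp>

int main()
{
  // Toy data: 1000 three-dimensional points.
  arma::mat dataset(3, 1000, arma::fill::randn);

  // Swap the default SampleInitialization for the new k-means++ policy.
  mlpack::kmeans::KMeans<mlpack::metric::EuclideanDistance,
                         KMeansPlusPlusInitialization> k;

  arma::Row<size_t> assignments;
  arma::mat centroids;
  k.Cluster(dataset, 5, assignments, centroids);  // Five clusters.

  return 0;
}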
66 changes: 66 additions & 0 deletions src/mlpack/tests/kmeans_test.cpp
@@ -12,6 +12,7 @@
#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/allow_empty_clusters.hpp>
#include <mlpack/methods/kmeans/refined_start.hpp>
#include <mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp>
#include <mlpack/methods/kmeans/elkan_kmeans.hpp>
#include <mlpack/methods/kmeans/hamerly_kmeans.hpp>
#include <mlpack/methods/kmeans/pelleg_moore_kmeans.hpp>
@@ -486,6 +487,71 @@ TEST_CASE("RefinedStartTest", "[KMeansTest]")
REQUIRE(distortion < 14000.0);
}

/**
* Test that the k-means++ initialization strategy returns decent initial
* cluster estimates.
*/
TEST_CASE("KMeansPlusPlusTest", "[KMeansTest]")
{
// Our dataset will be five Gaussians of largely varying numbers of points,
// and we expect that the k-means++ initialization will return good guesses
// at what these Gaussians are.
arma::mat data(3, 3000);
data.randn();

// First Gaussian: 1000 points, centered at (0, 0, 0).
// Second Gaussian: 200 points, centered at (5, 0, -2).
// Third Gaussian: 500 points, centered at (-2, -2, -2).
// Fourth Gaussian: 100 points, centered at (-6, 8, 8).
// Fifth Gaussian: 1200 points, centered at (1, 6, 1).
arma::mat centroids(" 0 5 -2 -6 1;"
" 0 0 -2 8 6;"
" 0 -2 -2 8 1");

for (size_t i = 1000; i < 1200; ++i)
data.col(i) += centroids.col(1);
for (size_t i = 1200; i < 1700; ++i)
data.col(i) += centroids.col(2);
for (size_t i = 1700; i < 1800; ++i)
data.col(i) += centroids.col(3);
for (size_t i = 1800; i < 3000; ++i)
data.col(i) += centroids.col(4);

KMeansPlusPlusInitialization k;
arma::mat resultingCentroids;
k.Cluster(data, 5, resultingCentroids);

// Calculate resulting assignments.
arma::Row<size_t> assignments(data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
{
double bestDist = DBL_MAX;
for (size_t j = 0; j < 5; ++j)
{
const double dist = metric::EuclideanDistance::Evaluate(data.col(i),
resultingCentroids.col(j));
if (dist < bestDist)
{
bestDist = dist;
assignments[i] = j;
}
}
}

// Calculate the distortion: the sum of distances from each point to its
// assigned centroid.
double distortion = 0;
for (size_t i = 0; i < 3000; ++i)
distortion += metric::EuclideanDistance::Evaluate(data.col(i),
resultingCentroids.col(assignments[i]));

// Using k-means++, the distortion for this dataset is usually around 10000.
// Regular k-means gives between 10000 and 30000 (the 10000 figure is likely
// a corner case that does not actually give good clusters), and random
// initial starts give distortion around 22000. So we'll require that our
// distortion is less than 12000.
REQUIRE(distortion < 12000.0);
}

#ifdef ARMA_HAS_SPMAT
/**
* Make sure sparse k-means works okay.