Add k-means++ initialization. #2813

Merged · 8 commits · Jan 26, 2021
Changes from 7 commits
2 changes: 2 additions & 0 deletions HISTORY.md
@@ -13,6 +13,8 @@

* Add finalizers to Julia binding model types to fix memory handling (#2756).

* Add k-means++ initialization strategy (#2813).

### mlpack 3.4.2
###### 2020-10-26
* Added Mean Absolute Percentage Error.
5 changes: 4 additions & 1 deletion src/mlpack/core/util/param_checks.hpp
@@ -43,11 +43,14 @@ namespace util {
* @param fatal If true, output goes to Log::Fatal instead of Log::Warn and an
* exception is thrown.
* @param customErrorMessage Error message to append.
* @param allowNone If true, no error will be issued if none of the
*      parameters in the constraints set was passed.
*/
void RequireOnlyOnePassed(
const std::vector<std::string>& constraints,
const bool fatal = true,
const std::string& customErrorMessage = "");
const std::string& customErrorMessage = "",
const bool allowNone = false);

/**
* Require that at least one of the given parameters in the constraints set was
5 changes: 3 additions & 2 deletions src/mlpack/core/util/param_checks_impl.hpp
@@ -21,7 +21,8 @@ namespace util {
inline void RequireOnlyOnePassed(
const std::vector<std::string>& constraints,
const bool fatal,
const std::string& errorMessage)
const std::string& errorMessage,
const bool allowNone)
{
if (BINDING_IGNORE_CHECK(constraints))
return;
@@ -57,7 +58,7 @@ inline void RequireOnlyOnePassed(
stream << "; " << errorMessage;
stream << "!" << std::endl;
}
else if (set == 0)
else if (set == 0 && !allowNone)
{
stream << (fatal ? "Must " : "Should ");

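A minimal sketch of the new signature in use (mirroring the call that kmeans_main.cpp makes later in this PR; not part of this hunk):

// With allowNone == true, passing neither "refined_start" nor
// "kmeans_plus_plus" is acceptable; passing both is still a fatal error.
mlpack::util::RequireOnlyOnePassed(
    { "refined_start", "kmeans_plus_plus" },
    true,   // fatal: a violation throws rather than warns.
    "Only one initialization strategy can be specified!",
    true);  // allowNone: zero passed parameters is fine.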
1 change: 1 addition & 0 deletions src/mlpack/methods/kmeans/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SOURCES
kill_empty_clusters.hpp
kmeans.hpp
kmeans_impl.hpp
kmeans_plus_plus_initialization.hpp
max_variance_new_cluster.hpp
max_variance_new_cluster_impl.hpp
naive_kmeans.hpp
33 changes: 24 additions & 9 deletions src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -17,6 +17,7 @@
#include "allow_empty_clusters.hpp"
#include "kill_empty_clusters.hpp"
#include "refined_start.hpp"
#include "kmeans_plus_plus_initialization.hpp"
#include "elkan_kmeans.hpp"
#include "hamerly_kmeans.hpp"
#include "pelleg_moore_kmeans.hpp"
@@ -44,14 +45,17 @@ BINDING_LONG_DESC(
" the point furthest from the centroid of the cluster with maximum variance"
" is taken to fill that cluster."
"\n\n"
"Optionally, the Bradley and Fayyad approach (\"Refining initial points for"
" k-means clustering\", 1998) can be used to select initial points by "
"specifying the " + PRINT_PARAM_STRING("refined_start") + " parameter. "
"This approach works by taking random samplings of the dataset; to specify "
"the number of samplings, the " + PRINT_PARAM_STRING("samplings") +
" parameter is used, and to specify the percentage of the dataset to be "
"used in each sample, the " + PRINT_PARAM_STRING("percentage") +
" parameter is used (it should be a value between 0.0 and 1.0)."
"Optionally, the strategy to choose initial centroids can be specified. "
"The k-means++ algorithm can be used to choose initial centroids with "
"the " + PRINT_PARAM_STRING("kmeans_plus_plus") + " parameter. The "
"Bradley and Fayyad approach (\"Refining initial points for k-means "
"clustering\", 1998) can be used to select initial points by specifying "
"the " + PRINT_PARAM_STRING("refined_start") + " parameter. This approach "
"works by taking random samplings of the dataset; to specify the number of "
"samplings, the " + PRINT_PARAM_STRING("samplings") + " parameter is used, "
"and to specify the percentage of the dataset to be used in each sample, "
"the " + PRINT_PARAM_STRING("percentage") + " parameter is used (it should "
"be a value between 0.0 and 1.0)."
"\n\n"
"There are several options available for the algorithm used for each Lloyd "
"iteration, specified with the " + PRINT_PARAM_STRING("algorithm") + " "
@@ -102,6 +106,7 @@ BINDING_EXAMPLE(
// See also...
BINDING_SEE_ALSO("K-Means tutorial", "@doxygen/kmtutorial.html");
BINDING_SEE_ALSO("@dbscan", "#dbscan");
BINDING_SEE_ALSO("k-means++", "https://en.wikipedia.org/wiki/K-means%2B%2B");
BINDING_SEE_ALSO("Using the triangle inequality to accelerate k-means (pdf)",
"http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf");
BINDING_SEE_ALSO("Making k-means even faster (pdf)",
@@ -147,6 +152,8 @@ PARAM_INT_IN("samplings", "Number of samplings to perform for refined start "
"(use when --refined_start is specified).", "S", 100);
PARAM_DOUBLE_IN("percentage", "Percentage of dataset to use for each refined "
"start sampling (use when --refined_start is specified).", "p", 0.02);
PARAM_FLAG("kmeans_plus_plus", "Use the k-means++ initialization strategy to "
"choose initial points.", "K");

PARAM_STRING_IN("algorithm", "Algorithm to use for the Lloyd iteration "
"('naive', 'pelleg-moore', 'elkan', 'hamerly', 'dualtree', or "
@@ -176,6 +183,9 @@ static void mlpackMain()
else
math::RandomSeed((size_t) std::time(NULL));

RequireOnlyOnePassed({ "refined_start", "kmeans_plus_plus" }, true,
"Only one initialization strategy can be specified!", true);

// Now, start building the KMeans type that we'll be using. Start with the
// initial partition policy. The call to FindEmptyClusterPolicy<> results in
// a call to RunKMeans<> and the algorithm is completed.
@@ -191,6 +201,11 @@

FindEmptyClusterPolicy<RefinedStart>(RefinedStart(samplings, percentage));
}
else if (IO::HasParam("kmeans_plus_plus"))
{
FindEmptyClusterPolicy<KMeansPlusPlusInitialization>(
KMeansPlusPlusInitialization());
}
else
{
FindEmptyClusterPolicy<SampleInitialization>(SampleInitialization());
@@ -271,7 +286,7 @@ void RunKMeans(const InitialPartitionPolicy& ipp)
const int maxIterations = IO::GetParam<int>("max_iterations");

// Make sure we have an output file if we're not doing the work in-place.
RequireAtLeastOnePassed({ "in_place", "output", "centroid" }, false,
RequireOnlyOnePassed({ "in_place", "output", "centroid" }, false,
"no results will be saved");

arma::mat dataset = IO::GetParam<arma::mat>("input"); // Load our dataset.
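Taken together, the new option would be invoked from the command-line binding roughly like this (a sketch assuming mlpack's usual convention that matrix parameters take a _file suffix on the command line; the flag names come from the PARAM definitions above):

mlpack_kmeans --input_file data.csv --clusters 5 --kmeans_plus_plus --output_file assignments.csv --centroid_file centroids.csv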
103 changes: 103 additions & 0 deletions src/mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp
@@ -0,0 +1,103 @@
/**
* @file methods/kmeans/kmeans_plus_plus_initialization.hpp
* @author Ryan Curtin
*
* This file implements the k-means++ initialization strategy.
*/
#ifndef MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP
#define MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP

#include <mlpack/prereqs.hpp>

/**
* This class implements the k-means++ initialization, as described in the
* following paper:
*
* @code
* @inproceedings{arthur2007k,
* title={k-means++: The advantages of careful seeding},
* author={Arthur, David and Vassilvitskii, Sergei},
* booktitle={Proceedings of the Eighteenth Annual ACM-SIAM Symposium on
* Discrete Algorithms (SODA '07)},
* pages={1027--1035},
* year={2007},
* organization={Society for Industrial and Applied Mathematics}
* }
* @endcode
*
* In accordance with mlpack's InitialPartitionPolicy template type, we only
* need to implement a constructor and a method to compute the initial
* centroids.
*/
class KMeansPlusPlusInitialization
{
public:
//! Empty constructor, required by the InitialPartitionPolicy type definition.
KMeansPlusPlusInitialization() { }

/**
* Initialize the centroids matrix using the k-means++ strategy: the first
* centroid is sampled uniformly at random from the data, and each later
* centroid is sampled with probability proportional to the squared distance
* between a point and its nearest already-chosen centroid.
*
* @param data Dataset.
* @param clusters Number of clusters.
* @param centroids Matrix to put initial centroids into.
*/
template<typename MatType>
inline static void Cluster(const MatType& data,
const size_t clusters,
arma::mat& centroids)
{
centroids.set_size(data.n_rows, clusters);

// We'll sample our first point fully randomly.
size_t firstPoint = mlpack::math::RandInt(0, data.n_cols);
centroids.col(0) = data.col(firstPoint);

// Utility variable.
arma::vec distribution(data.n_cols);

// Now, sample other points...
for (size_t i = 1; i < clusters; ++i)
{
// We must compute the CDF for sampling... this depends on the computation
// of the minimum distance between each point and its closest
// already-chosen centroid.
//
// This computation is ripe for speedup with trees! I am not sure exactly
// how much we would need to approximate, but I think it could be done
// without breaking the O(log k)-competitive guarantee (I think).
for (size_t p = 0; p < data.n_cols; ++p)
{
double minDistance = std::numeric_limits<double>::max();
for (size_t j = 0; j < i; ++j)
{
const double distance =
mlpack::metric::SquaredEuclideanDistance::Evaluate(data.col(p),
centroids.col(j));
minDistance = std::min(distance, minDistance);
}

distribution[p] = minDistance;
}

// Next, normalize the distribution so it sums to one (strictly speaking we
// could avoid this by sampling in [0, sum] instead).
distribution /= arma::accu(distribution);

// Turn it into a CDF for convenience...
for (size_t j = 1; j < distribution.n_elem; ++j)
distribution[j] += distribution[j - 1];

// Sample a point...
const double sampleValue = mlpack::math::Random();
const double* elem = std::lower_bound(distribution.begin(),
distribution.end(), sampleValue);
// Pointer subtraction already yields an element index, not a byte offset,
// so no division by sizeof(double) is needed.
const size_t position = (size_t) (elem - distribution.begin());
centroids.col(i) = data.col(position);
}
}
};

#endif
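For context: k-means++ draws the first centroid uniformly at random, then draws each subsequent centroid with probability proportional to D(x)^2, the squared distance from point x to its nearest already-chosen centroid — exactly the distribution the loop in Cluster() builds. A minimal, self-contained sketch of plugging the new policy into mlpack's KMeans class (assuming the KMeans<MetricType, InitialPartitionPolicy, ...> template parameters; an illustration, not part of the diff):

#include <mlpack/core.hpp>
#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp>

int main()
{
  // Toy data: 1000 three-dimensional points.
  arma::mat dataset(3, 1000, arma::fill::randn);

  // Swap the default SampleInitialization for the new k-means++ policy.
  mlpack::kmeans::KMeans<mlpack::metric::EuclideanDistance,
                         KMeansPlusPlusInitialization> k;

  arma::Row<size_t> assignments;
  arma::mat centroids;
  k.Cluster(dataset, 5, assignments, centroids);  // Five clusters.

  return 0;
}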
66 changes: 66 additions & 0 deletions src/mlpack/tests/kmeans_test.cpp
@@ -12,6 +12,7 @@
#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/allow_empty_clusters.hpp>
#include <mlpack/methods/kmeans/refined_start.hpp>
#include <mlpack/methods/kmeans/kmeans_plus_plus_initialization.hpp>
#include <mlpack/methods/kmeans/elkan_kmeans.hpp>
#include <mlpack/methods/kmeans/hamerly_kmeans.hpp>
#include <mlpack/methods/kmeans/pelleg_moore_kmeans.hpp>
@@ -486,6 +487,71 @@ TEST_CASE("RefinedStartTest", "[KMeansTest]")
REQUIRE(distortion < 14000.0);
}

/**
* Test that the k-means++ initialization strategy returns decent initial
* cluster estimates.
*/
TEST_CASE("KMeansPlusPlusTest", "[KMeansTest]")
{
// Our dataset will be five Gaussians of largely varying numbers of points,
// and we expect that the k-means++ initialization will return good guesses
// at what these Gaussians are.
arma::mat data(3, 3000);
data.randn();

// First Gaussian: 1000 points, centered at (0, 0, 0).
// Second Gaussian: 200 points, centered at (5, 0, -2).
// Third Gaussian: 500 points, centered at (-2, -2, -2).
// Fourth Gaussian: 100 points, centered at (-6, 8, 8).
// Fifth Gaussian: 1200 points, centered at (1, 6, 1).
arma::mat centroids(" 0 5 -2 -6 1;"
" 0 0 -2 8 6;"
" 0 -2 -2 8 1");

for (size_t i = 1000; i < 1200; ++i)
data.col(i) += centroids.col(1);
for (size_t i = 1200; i < 1700; ++i)
data.col(i) += centroids.col(2);
for (size_t i = 1700; i < 1800; ++i)
data.col(i) += centroids.col(3);
for (size_t i = 1800; i < 3000; ++i)
data.col(i) += centroids.col(4);

KMeansPlusPlusInitialization k;
arma::mat resultingCentroids;
k.Cluster(data, 5, resultingCentroids);

// Calculate resulting assignments.
arma::Row<size_t> assignments(data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
{
double bestDist = DBL_MAX;
for (size_t j = 0; j < 5; ++j)
{
const double dist = metric::EuclideanDistance::Evaluate(data.col(i),
resultingCentroids.col(j));
if (dist < bestDist)
{
bestDist = dist;
assignments[i] = j;
}
}
}

// Calculate the distortion: the sum of distances from each point to its
// assigned centroid.
double distortion = 0;
for (size_t i = 0; i < 3000; ++i)
distortion += metric::EuclideanDistance::Evaluate(data.col(i),
resultingCentroids.col(assignments[i]));

// Using k-means++, the distortion for this dataset is usually around 10000.
// Regular k-means gives between 10000 and 30000 (the 10000 figure is likely
// a corner case that does not actually give good clusters), and random
// initial starts give distortion around 22000. So we'll require that our
// distortion is less than 12000.
REQUIRE(distortion < 12000.0);
}

#ifdef ARMA_HAS_SPMAT
/**
* Make sure sparse k-means works okay.