[![Binder](https://mybinder.org/badge_logo.svg)](https://lab.mlpack.org/v2/gh/mlpack/examples/master?urlpath=lab%2Ftree%2Fpima_indians_diabetes_clustering_with_kmeans%2Fpima-indians-diabetes-kmeans-cpp.ipynb)

In [1]:
/**
 * @file pima-indians-diabetes-kmeans-cpp.ipynb
 *
 * A simple example usage of K-Means clustering
 * applied to the Pima Indians Diabetes dataset.
 * 
 * https://www.kaggle.com/uciml/pima-indians-diabetes-database
 */

Download Pima Indians Diabetes dataset.

In [2]:
!wget -q -O pima-indians-diabetes.csv https://lab.mlpack.org/data/pima-indians-diabetes.csv

In [3]:
#include <mlpack/xeus-cling.hpp>

#include <mlpack/core.hpp>
#include <mlpack/methods/pca/pca.hpp>
#include <mlpack/methods/kmeans/kmeans.hpp>

#include <sstream>

In [4]:
// Header files to create and show the plot.
#define WITHOUT_NUMPY 1
#include "matplotlibcpp.h"
#include "xwidgets/ximage.hpp"
#include "scatter.hpp"

namespace plt = matplotlibcpp;

In [5]:
using namespace mlpack;

In [6]:
using namespace mlpack::pca;

In [7]:
using namespace mlpack::kmeans;

In [8]:
// The dataset is originally from the National Institute of Diabetes and
// Digestive and Kidney Diseases and can be used to predict whether a
// patient has diabetes based on certain diagnostic factors.
arma::mat input;
data::Load("pima-indians-diabetes.csv", input);

In [9]:
// Print the first 10 rows of the input data.
std::cout << std::setw(18) << "Pregnancies "
          << std::setw(10) << "Glucose "
          << "BloodPressure "
          << std::left << std::setw(18) << "SkinThickness "
          << std::left << std::setw(15) << "Insulin "
          << "BMI "
          << "DiabetesPedigreeFunction "
          << "Age "
          << "Outcome " << std::endl;

std::cout << input.submat(0, 0, input.n_rows - 1 , 10).t() << std::endl;

      Pregnancies   Glucose BloodPressure SkinThickness     Insulin        BMI DiabetesPedigreeFunction Age Outcome 
   6.0000e+00   1.4800e+02   7.2000e+01   3.5000e+01            0   3.3600e+01   6.2700e-01   5.0000e+01   1.0000e+00
   1.0000e+00   8.5000e+01   6.6000e+01   2.9000e+01            0   2.6600e+01   3.5100e-01   3.1000e+01            0
   8.0000e+00   1.8300e+02   6.4000e+01            0            0   2.3300e+01   6.7200e-01   3.2000e+01   1.0000e+00
   1.0000e+00   8.9000e+01   6.6000e+01   2.3000e+01   9.4000e+01   2.8100e+01   1.6700e-01   2.1000e+01            0
            0   1.3700e+02   4.0000e+01   3.5000e+01   1.6800e+02   4.3100e+01   2.2880e+00   3.3000e+01   1.0000e+00
   5.0000e+00   1.1600e+02   7.4000e+01            0            0   2.5600e+01   2.0100e-01   3.0000e+01            0
   3.0000e+00   7.8000e+01   5.0000e+01   3.2000e+01   8.8000e+01   3.1000e+01   2.4800e-01   2.6000e+01   1.0000e+00
   1.0000e+01   1.1500e+02            0            0     

In [10]:
// Split the labels last column.
arma::rowvec labels = input.row(input.n_rows - 1);
arma::mat dataset = input.rows(0, input.n_rows - 2);

In [11]:
// For the convenience of visualization, we take the first two principle components
// as the new feature variables and conduct K-means only on these two dimensional data.
PCA<> pca(true);
pca.Apply(dataset, 2);

In [12]:
// Print the first ten columns of the transformed input.
std::cout << dataset.cols(0, 10).t() << std::endl;

  -1.0683   1.2336
   1.1212  -0.7328
   0.3955   1.5955
   1.1155  -1.2698
  -2.3571  -2.1829
   1.4345   0.8358
   0.9299  -1.1068
   2.1478   0.8566
  -3.2957  -0.2424
   1.8374   3.2053
   0.5320   0.6473



In [13]:
// Plot the transformed input.

// Get the data to for the indices.
std::vector<double> x = arma::conv_to<std::vector<double>>::from(dataset.row(0));
std::vector<double> y = arma::conv_to<std::vector<double>>::from(dataset.row(1));

plt::figure_size(800, 800);
plt::scatter(x, y, 4);

plt::xlabel("Principal Component - 1");
plt::ylabel("Principal Component - 2");
plt::title("Projection of Pima Indians Diabetes dataset onto first two principal components");

plt::save("./pca.png");
auto im = xw::image_from_file("pca.png").finalize();
im

A Jupyter widget

In [14]:
// Perform K-means clustering using the Euclidean distance.
//
// For more information checkout https://mlpack.org/doc/stable/doxygen/classmlpack_1_1kmeans_1_1KMeans.html
// or uncomment the line below.
// ?KMeans<>

// The assignments will be stored in this vector.
arma::Row<size_t> assignments;

// The centroids will be stored in this matrix.
arma::mat centroids;

// The number of clusters we are getting.
size_t cluster = 13;

// Number of optimization steps to perform.
size_t iterations = 30;

// Generate data string to plot the data.
std::stringstream xData, yData, aData, cData;
for (size_t i = 0; i < dataset.n_cols; ++i)
{
    xData << dataset.col(i)(0) << ";";
    yData << dataset.col(i)(1) << ";";
}

// Collect the assignments and centroids for each
// optimization step. This is just done to plot the
// optimization step, a user can avoid the lines
// below and use:
// KMeans<> kmeans;
// kmeans.Cluster(dataset, cluster, assignments, centroids);
// To discard the intermediate steps.
for (size_t i = 0; i < iterations; ++i)
{
    // Initialize with the default arguments.
    KMeans<> kmeans;
    // Set the number of optimization steps to one, just
    // for the purpose of ploting the optimization process.
    kmeans.MaxIterations() = 1;
    
    // Start with the given assignments and centroids if
    // this is not the first step.
    if (i == 0)
        kmeans.Cluster(dataset, cluster, assignments, centroids);
    else
        kmeans.Cluster(dataset, cluster, assignments, centroids, true, true);
    
    // Create assignments string for plotting.
    for (size_t j = 0; j < assignments.n_elem; ++j)
        aData << assignments(j) << ";";

    // Create centroids string for plotting.
    for (size_t j = 0; j < centroids.n_elem; ++j)
        cData << centroids(j) << ";";
}

In [15]:
// Plot the K-means optimization steps.
Scatter(xData.str()  /* Dataset first feature. */,
        yData.str()  /* Dataset second feature. */,
        aData.str()  /* K-means assignments. */,
        cData.str()  /* K-means centroids. */,
        iterations,  /* Number of optimization steps. */
        "output.gif" /* Output file. */);

In [16]:
auto im = xw::image_from_file("output.gif").finalize();
im

A Jupyter widget

Scatter plot with different clusters in different colors. The black cross marks the centers of a cluster.