[![Binder](https://mybinder.org/badge_logo.svg)](https://lab.mlpack.org/v2/gh/mlpack/examples/master?urlpath=lab%2Ftree%2Fcifar10_transformation_with_pca%2Fcifar-10-pca-cpp.ipynb)

In [1]:
/**
 * @file cifar-10-pca-cpp.ipynb
 *
 * A simple example usage of Principal Component Analysis (PCA)
 * applied to the CIFAR-10 dataset.
 * 
 * https://www.cs.toronto.edu/~kriz/cifar.html
 */

In [2]:
!wget -O - https://lab.mlpack.org/data/cifar10-images.tar.gz | tar -xz

In [3]:
#include <mlpack/xeus-cling.hpp>

#include <mlpack/core.hpp>
#include <mlpack/methods/pca/pca.hpp>

// Additional decomposition methods.
#include <mlpack/methods/pca/decomposition_policies/exact_svd_method.hpp>
#include <mlpack/methods/pca/decomposition_policies/quic_svd_method.hpp>
#include <mlpack/methods/pca/decomposition_policies/randomized_svd_method.hpp>
#include <mlpack/methods/pca/decomposition_policies/randomized_block_krylov_method.hpp>

// Enable image load/save support.
#define HAS_STB

In [4]:
// Header file to list files.
#include <boost/filesystem.hpp>

// Header files to create and show the plot.
#define WITHOUT_NUMPY 1
#include "matplotlibcpp.h"
#include "xwidgets/ximage.hpp"

namespace plt = matplotlibcpp;

In [5]:
using namespace mlpack;

In [6]:
using namespace mlpack::pca;

In [7]:
// Function that returns all images and labels for the specified path.
void GetImageData(const std::string& path,
                  std::vector<std::string>& imagesPath,
                  std::vector<std::string>& imageLabels)
{
    boost::filesystem::path dir(path);

    std::string label;
    for (auto i = boost::filesystem::recursive_directory_iterator(dir);
         i != boost::filesystem::recursive_directory_iterator(); i++)
    {
        // Discard directories.
        if (boost::filesystem::is_regular_file(i->path()) &&
            i->path().extension() == ".png" &&
            i->path().filename().string()[0] != '.')
        {
            imagesPath.push_back(i->path().string());
            imageLabels.push_back(label);
        }
        else if (i->path().filename().string()[0] != '.')
        {
            label = i->path().filename().string();
        }
    }
}

// Collect the training set. GetImageData fills two parallel vectors: the
// first holds the path of every image, the second holds the label of the
// image at the same index.
std::vector<std::string> trainImagesPath;
std::vector<std::string> trainLabels;
GetImageData("cifar10-images/train/", trainImagesPath, trainLabels);

// Collect the testing set in the same way: one vector of image paths and a
// parallel vector of the corresponding labels.
std::vector<std::string> testImagesPath;
std::vector<std::string> testLabels;
GetImageData("cifar10-images/test/", testImagesPath, testLabels);

In [8]:
// Load the first image of the testing data and print its shape.
// `info` is passed to data::Load and read back afterwards for the width,
// height and channel count; it is also handed to the bulk load further below.
// NOTE(review): the trailing `false` is presumably mlpack's `fatal` flag
// (do not abort on a failed load) - confirm against the data::Load docs.
arma::Mat<unsigned char> imageMatrix;
data::ImageInfo info;
data::Load(testImagesPath[0], imageMatrix, info, false);

std::cout << "Image info -"
          << " Width:" << info.Width()
          << " Height: " << info.Height()
          << " Channels: " << info.Channels() << std::endl;

Image info - Width:32 Height: 32 Channels: 3


In [9]:
// Display image number 0 from the training data and print its label.
auto im = xw::image_from_file(trainImagesPath[0]).finalize();
// Set the height of the image widget's layout.
im.layout().height = "300px";
std::cout << trainLabels[0] << std::endl;
// Bare expression without a trailing semicolon: the notebook shows the
// widget as the cell's result.
im

Airplane


A Jupyter widget

In [10]:
// Display image number 2000 from the testing data and print its label.
auto im = xw::image_from_file(testImagesPath[2000]).finalize();
// Set the height of the image widget's layout.
im.layout().height = "300px";
std::cout << testLabels[2000] << std::endl;
// Bare expression without a trailing semicolon: the notebook shows the
// widget as the cell's result.
im

Dog


A Jupyter widget

In [11]:
// Load all images from the test set into a single matrix.
// Each image ends up as one column, which is why n_cols is reported below as
// the number of images loaded.
arma::mat testImageMatrix;
data::Load(testImagesPath, testImageMatrix, info, false);

std::cout << testImageMatrix.n_cols << " images loaded." << std::endl;

10000 images loaded.


In [12]:
// Perform Principal Component Analysis using the randomized SVD method.
// Other decomposition methods are 'exact', 'randomized-block-krylov', 'quic'
// (see the decomposition policy headers included at the top of the notebook).
//
// For more information, check out https://www.mlpack.org/doc/stable/doxygen/classmlpack_1_1pca_1_1PCA.html
// or uncomment the line below.
// ?PCA<>
// NOTE(review): the `true` constructor argument presumably enables scaling of
// the data before the decomposition - confirm against the PCA documentation.
PCA<RandomizedSVDPolicy> pca(true);
// Reduce the data to 2 dimensions; testImageMatrix itself holds the
// transformed data afterwards (it is what the following cells print and plot).
pca.Apply(testImageMatrix, 2);

In [13]:
// Print the first eleven columns (indices 0 through 10, both inclusive) of
// the transformed input, transposed so that each printed row is one sample.
std::cout << testImageMatrix.cols(0, 10).t() << std::endl;

  -22.3394   19.6474
  -33.1218  -20.4882
   40.1167   26.8005
  -15.9143  -17.9865
    8.9120  -17.3623
    9.5697   -9.3122
  -29.6697   21.7194
  -53.8475  -19.0090
  -67.3592   -6.5962
  -15.9824   -2.8275
   -3.5093   -8.8609



In [14]:
// Return the positions in `labels` whose entry equals `label`.
//
// @param labels Label of every sample, in sample order.
// @param label  The label to search for.
// @return Vector of matching positions in ascending order (empty if none).
arma::uvec getIndices(const std::vector<std::string>& labels, const std::string& label)
{
    std::vector<arma::uword> matches;

    size_t position = 0;
    for (const std::string& candidate : labels)
    {
        if (candidate == label)
            matches.push_back(position);
        ++position;
    }

    return arma::uvec(matches);
}

In [15]:
// Scatter-plot the two-dimensional projection, one series per class.

// The CIFAR-10 class names; used to select samples and as legend entries.
std::vector<std::string> labels = {"Airplane", "Automobile", "Bird",
                                   "Cat", "Deer", "Dog", "Frog",
                                   "Horse", "Ship", "Truck"};
plt::figure_size(800, 800);

for (const std::string& className : labels)
{
    // Select the columns (samples) whose test label matches this class.
    arma::mat classPoints = testImageMatrix.cols(getIndices(testLabels, className));

    // Row 0 and row 1 hold the two components of every selected sample.
    std::vector<double> firstComponent =
        arma::conv_to<std::vector<double>>::from(classPoints.row(0));
    std::vector<double> secondComponent =
        arma::conv_to<std::vector<double>>::from(classPoints.row(1));

    // Keyword arguments for the scatter call: the class name becomes the
    // legend entry of this series.
    std::map<std::string, std::string> legendEntry{{"label", className}};

    plt::scatter(firstComponent, secondComponent, 10, legendEntry);
}

plt::xlabel("Principal Component - 1");
plt::ylabel("Principal Component - 2");
plt::title("Projection of CIFAR-10 dataset onto first two principal components");
plt::legend();

// Write the figure to disk, then load the saved file as an image widget and
// show it as the cell's result.
plt::save("./plot.png");
auto im = xw::image_from_file("plot.png").finalize();
im

A Jupyter widget

We can observe some structure in the plot above: samples belonging to the same class lie close to each other,
while images of different classes, such as a Truck and a Horse, are further apart. We can also observe that the
first two principal components aren't sufficient for separating the classes.