diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt index 5734d5c9d8a..39aa62fd036 100644 --- a/src/mlpack/methods/CMakeLists.txt +++ b/src/mlpack/methods/CMakeLists.txt @@ -46,6 +46,7 @@ set(DIRS perceptron quic_svd radical + randomized_svd range_search rann rmva diff --git a/src/mlpack/methods/pca/CMakeLists.txt b/src/mlpack/methods/pca/CMakeLists.txt index 8023d831616..5389ae02157 100644 --- a/src/mlpack/methods/pca/CMakeLists.txt +++ b/src/mlpack/methods/pca/CMakeLists.txt @@ -2,7 +2,7 @@ # Anything not in this list will not be compiled into mlpack. set(SOURCES pca.hpp - pca.cpp + pca_impl.hpp ) # Add directory name to sources. @@ -14,4 +14,6 @@ endforeach() # the parent scope). set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) +add_subdirectory(decomposition_policies) + add_cli_executable(pca) diff --git a/src/mlpack/methods/pca/decomposition_policies/CMakeLists.txt b/src/mlpack/methods/pca/decomposition_policies/CMakeLists.txt new file mode 100644 index 00000000000..968c7cc4bb9 --- /dev/null +++ b/src/mlpack/methods/pca/decomposition_policies/CMakeLists.txt @@ -0,0 +1,16 @@ +# Define the files we need to compile +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + exact_svd_method.hpp + randomized_svd_method.hpp + quic_svd_method.hpp +) + +# Add directory name to sources. +set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). 
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/src/mlpack/methods/pca/decomposition_policies/exact_svd_method.hpp b/src/mlpack/methods/pca/decomposition_policies/exact_svd_method.hpp new file mode 100644 index 00000000000..7b24b15ac73 --- /dev/null +++ b/src/mlpack/methods/pca/decomposition_policies/exact_svd_method.hpp @@ -0,0 +1,72 @@ +/** + * @file exact_svd_method.hpp + * @author Ajinkya Kale + * @author Ryan Curtin + * @author Marcus Edel + * + * Implementation of the exact svd method for use in the Principal Components + * Analysis method. + */ + +#ifndef MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_EXACT_SVD_METHOD_HPP +#define MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_EXACT_SVD_METHOD_HPP + +#include + +namespace mlpack { +namespace pca { + +/** + * Implementation of the exact SVD policy. + */ +class ExactSVDPolicy +{ + public: + /** + * Apply Principal Component Analysis to the provided data set using the + * exact SVD method. + * + * @param data Data matrix. + * @param centeredData Centered data matrix. + * @param transformedData Matrix to put results of PCA into. + * @param eigVal Vector to put eigenvalues into. + * @param eigvec Matrix to put eigenvectors (loadings) into. + * @param rank Rank of the decomposition. + */ + void Apply(const arma::mat& data, + const arma::mat& centeredData, + arma::mat& transformedData, + arma::vec& eigVal, + arma::mat& eigvec, + const size_t /* rank */) + { + // This matrix will store the right singular values; we do not need them. + arma::mat v; + + // Do singular value decomposition. Use the economical singular value + // decomposition if the columns are much larger than the rows. + if (data.n_rows < data.n_cols) + { + // Do economical singular value decomposition and compute only the left + // singular vectors. + arma::svd_econ(eigvec, eigVal, v, centeredData, 'l'); + } + else + { + arma::svd(eigvec, eigVal, v, centeredData); + } + + // Now we must square the singular values to get the eigenvalues. 
+ // In addition we must divide by the number of points, because the + // covariance matrix is X * X' / (N - 1). + eigVal %= eigVal / (data.n_cols - 1); + + // Project the samples to the principals. + transformedData = arma::trans(eigvec) * centeredData; + } +}; + +} // namespace pca +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/pca/decomposition_policies/quic_svd_method.hpp b/src/mlpack/methods/pca/decomposition_policies/quic_svd_method.hpp new file mode 100644 index 00000000000..c866a8f4022 --- /dev/null +++ b/src/mlpack/methods/pca/decomposition_policies/quic_svd_method.hpp @@ -0,0 +1,92 @@ +/** + * @file quic_svd_method.hpp + * @author Marcus Edel + * + * Implementation of the QUIC-SVD policy for use in the Principal Components + * Analysis method. + */ + +#ifndef MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_QUIC_SVD_METHOD_HPP +#define MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_QUIC_SVD_METHOD_HPP + +#include +#include + +namespace mlpack { +namespace pca { + +/** + * Implementation of the QUIC-SVD policy. + */ +class QUICSVDPolicy +{ + public: + + /** + * Use QUIC-SVD method to perform the principal components analysis (PCA). + * + * @param epsilon Error tolerance fraction for calculated subspace. + * @param delta Cumulative probability for Monte Carlo error lower bound. + */ + QUICSVDPolicy(const double epsilon = 0.03, const double delta = 0.1) : + epsilon(epsilon), + delta(delta) + { + /* Nothing to do here */ + } + + /** + * Apply Principal Component Analysis to the provided data set using the + * QUIC-SVD method. + * + * @param data Data matrix. + * @param centeredData Centered data matrix. + * @param transformedData Matrix to put results of PCA into. + * @param eigVal Vector to put eigenvalues into. + * @param eigvec Matrix to put eigenvectors (loadings) into. + * @param rank Rank of the decomposition. 
+ */ + void Apply(const arma::mat& data, + const arma::mat& centeredData, + arma::mat& transformedData, + arma::vec& eigVal, + arma::mat& eigvec, + const size_t /* rank */) + { + // This matrix will store the right singular values; we do not need them. + arma::mat v, sigma; + + // Do singular value decomposition using the QUIC-SVD algorithm. + svd::QUIC_SVD quicsvd(centeredData, eigvec, v, sigma, epsilon, delta); + + // Now we must square the singular values to get the eigenvalues. + // In addition we must divide by the number of points, because the + // covariance matrix is X * X' / (N - 1). + eigVal = arma::pow(arma::diagvec(sigma), 2) / (data.n_cols - 1); + + // Project the samples to the principals. + transformedData = arma::trans(eigvec) * centeredData; + } + + //! Get the error tolerance fraction for calculated subspace. + double Epsilon() const { return epsilon; } + //! Modify the error tolerance fraction for calculated subspace. + double& Epsilon() { return epsilon; } + + //! Get the cumulative probability for Monte Carlo error lower bound. + double Delta() const { return delta; } + //! Modify the cumulative probability for Monte Carlo error lower bound. + double& Delta() { return delta; } + + private: + //! Error tolerance fraction for calculated subspace. + double epsilon; + + //! Cumulative probability for Monte Carlo error lower bound. + double delta; +}; + +} // namespace pca +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/pca/decomposition_policies/randomized_svd_method.hpp b/src/mlpack/methods/pca/decomposition_policies/randomized_svd_method.hpp new file mode 100644 index 00000000000..767eb9b5d39 --- /dev/null +++ b/src/mlpack/methods/pca/decomposition_policies/randomized_svd_method.hpp @@ -0,0 +1,97 @@ +/** + * @file randomized_svd_method.hpp + * @author Marcus Edel + * + * Implementation of the randomized svd method for use in the Principal + * Components Analysis method. 
+ */ + +#ifndef MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_RANDOMIZED_SVD_METHOD_HPP +#define MLPACK_METHODS_PCA_DECOMPOSITION_POLICIES_RANDOMIZED_SVD_METHOD_HPP + +#include +#include +#include + +namespace mlpack { +namespace pca { + +/** + * Implementation of the randomized SVD policy. + */ +class RandomizedSVDPolicy +{ + public: + /** + * Use randomized SVD method to perform the principal components analysis + * (PCA). + * + * @param iteratedPower Size of the normalized power iterations + * (Default: rank + 2). + * @param maxIterations Number of iterations for the power method + * (Default: 2). + */ + RandomizedSVDPolicy(const size_t iteratedPower = 0, + const size_t maxIterations = 2) : + iteratedPower(iteratedPower), + maxIterations(maxIterations) + { + /* Nothing to do here */ + } + + /** + * Apply Principal Component Analysis to the provided data set using the + * randomized SVD. + * + * @param data Data matrix. + * @param centeredData Centered data matrix. + * @param transformedData Matrix to put results of PCA into. + * @param eigVal Vector to put eigenvalues into. + * @param eigvec Matrix to put eigenvectors (loadings) into. + * @param rank Rank of the decomposition. + */ + void Apply(const arma::mat& data, + const arma::mat& centeredData, + arma::mat& transformedData, + arma::vec& eigVal, + arma::mat& eigvec, + const size_t rank) + { + // This matrix will store the right singular values; we do not need them. + arma::mat v; + + // Do singular value decomposition using the randomized SVD algorithm. + svd::RandomizedSVD rsvd(iteratedPower, maxIterations); + rsvd.Apply(data, eigvec, eigVal, v, rank); + + // Now we must square the singular values to get the eigenvalues. + // In addition we must divide by the number of points, because the + // covariance matrix is X * X' / (N - 1). + eigVal %= eigVal / (data.n_cols - 1); + + // Project the samples to the principals. + transformedData = arma::trans(eigvec) * centeredData; + } + + //! 
Get the size of the normalized power iterations. + size_t IteratedPower() const { return iteratedPower; } + //! Modify the size of the normalized power iterations. + size_t& IteratedPower() { return iteratedPower; } + + //! Get the number of iterations for the power method. + size_t MaxIterations() const { return maxIterations; } + //! Modify the number of iterations for the power method. + size_t& MaxIterations() { return maxIterations; } + + private: + //! Locally stored size of the normalized power iterations. + size_t iteratedPower; + + //! Locally stored number of iterations for the power method. + size_t maxIterations; +}; + +} // namespace pca +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/pca/pca.hpp b/src/mlpack/methods/pca/pca.hpp index c698110c066..41c7517dfef 100644 --- a/src/mlpack/methods/pca/pca.hpp +++ b/src/mlpack/methods/pca/pca.hpp @@ -1,26 +1,31 @@ /** * @file pca.hpp * @author Ajinkya Kale + * @author Ryan Curtin + * @author Marcus Edel * * Defines the PCA class to perform Principal Components Analysis on the - * specified data set. + * specified data set. There are many variations on how to do this, so + * template parameters allow the selection of different techniques. */ #ifndef MLPACK_METHODS_PCA_PCA_HPP #define MLPACK_METHODS_PCA_PCA_HPP #include +#include namespace mlpack { namespace pca { /** - * This class implements principal components analysis (PCA). This is a common, - * widely-used technique that is often used for either dimensionality reduction - * or transforming data into a better basis. Further information on PCA can be - * found in almost any statistics or machine learning textbook, and all over the - * internet. + * This class implements principal components analysis (PCA). This is a + * common, widely-used technique that is often used for either dimensionality + * reduction or transforming data into a better basis. 
Further information on + * PCA can be found in almost any statistics or machine learning textbook, and + * all over the internet. */ -class PCA +template<typename DecompositionPolicy = ExactSVDPolicy> +class PCAType { public: /** @@ -29,11 +34,12 @@ class PCA * * @param scaleData Whether or not to scale the data. */ - PCA(const bool scaleData = false); + PCAType(const bool scaleData = false, + const DecompositionPolicy& decomposition = DecompositionPolicy()); /** - * Apply Principal Component Analysis to the provided data set. It is safe to - * pass the same matrix reference for both data and transformedData. + * Apply Principal Component Analysis to the provided data set. It is safe + * to pass the same matrix reference for both data and transformedData. * * @param data Data matrix. * @param transformedData Matrix to put results of PCA into. @@ -43,11 +49,11 @@ void Apply(const arma::mat& data, arma::mat& transformedData, arma::vec& eigval, - arma::mat& eigvec) const; + arma::mat& eigvec); /** - * Apply Principal Component Analysis to the provided data set. It is safe to - * pass the same matrix reference for both data and transformedData. + * Apply Principal Component Analysis to the provided data set. It is safe + * to pass the same matrix reference for both data and transformedData. * * @param data Data matrix. * @param transformedData Matrix to store results of PCA in. @@ -55,29 +61,29 @@ void Apply(const arma::mat& data, arma::mat& transformedData, - arma::vec& eigVal) const; + arma::vec& eigVal); /** - * Use PCA for dimensionality reduction on the given dataset. This will save + * Use PCA for dimensionality reduction on the given dataset. This will save * the newDimension largest principal components of the data and remove the - * rest. The parameter returned is the amount of variance of the data that is - * retained; this is a value between 0 and 1. For instance, a value of 0.9 - * indicates that 90% of the variance present in the data was retained. + * rest.
The parameter returned is the amount of variance of the data that + * is retained; this is a value between 0 and 1. For instance, a value of + * 0.9 indicates that 90% of the variance present in the data was retained. * * @param data Data matrix. * @param newDimension New dimension of the data. * @return Amount of the variance of the data retained (between 0 and 1). */ - double Apply(arma::mat& data, const size_t newDimension) const; + double Apply(arma::mat& data, const size_t newDimension); //! This overload is here to make sure int gets casted right to size_t. - inline double Apply(arma::mat& data, const int newDimension) const + inline double Apply(arma::mat& data, const int newDimension) { return Apply(data, size_t(newDimension)); } /** - * Use PCA for dimensionality reduction on the given dataset. This will save + * Use PCA for dimensionality reduction on the given dataset. This will save * as many dimensions as necessary to retain at least the given amount of * variance (specified by parameter varRetained). The amount should be * between 0 and 1; if the amount is 0, then only 1 dimension will be @@ -91,23 +97,51 @@ class PCA * between 0 and 1. * @return Actual amount of variance retained (between 0 and 1). */ - double Apply(arma::mat& data, const double varRetained) const; + double Apply(arma::mat& data, const double varRetained); - //! Get whether or not this PCA object will scale (by standard deviation) the - //! data when PCA is performed. + //! Get whether or not this PCA object will scale (by standard deviation) + //! the data when PCA is performed. bool ScaleData() const { return scaleData; } //! Modify whether or not this PCA object will scale (by standard deviation) //! the data when PCA is performed. bool& ScaleData() { return scaleData; } private: + //! Scaling the data is when we reduce the variance of each dimension to 1. 
+ void ScaleData(arma::mat& centeredData) + { + if (scaleData) + { + // Scaling the data is when we reduce the variance of each dimension + // to 1. We do this by dividing each dimension by its standard + // deviation. + arma::vec stdDev = arma::stddev( + centeredData, 0, 1 /* for each dimension */); + + // If there are any zeroes, make them very small. + for (size_t i = 0; i < stdDev.n_elem; ++i) + if (stdDev[i] == 0) + stdDev[i] = 1e-50; + + centeredData /= arma::repmat(stdDev, 1, centeredData.n_cols); + } + } + //! Whether or not the data will be scaled by standard deviation when PCA is //! performed. bool scaleData; + //! Decomposition method used to perform principal components analysis. + DecompositionPolicy decomposition; }; // class PCA +//! 3.0.0 TODO: break reverse-compatibility by changing PCAType to PCA. +typedef PCAType PCA; + } // namespace pca } // namespace mlpack +// Include implementation. +#include "pca_impl.hpp" + #endif diff --git a/src/mlpack/methods/pca/pca.cpp b/src/mlpack/methods/pca/pca_impl.hpp similarity index 66% rename from src/mlpack/methods/pca/pca.cpp rename to src/mlpack/methods/pca/pca_impl.hpp index 62c78def76a..9e3a89cdc0d 100644 --- a/src/mlpack/methods/pca/pca.cpp +++ b/src/mlpack/methods/pca/pca_impl.hpp @@ -1,19 +1,29 @@ /** * @file pca.cpp * @author Ajinkya Kale + * @author Ryan Curtin + * @author Marcus Edel * * Implementation of PCA class to perform Principal Components Analysis on the * specified data set. 
*/ -#include "pca.hpp" + +#ifndef MLPACK_METHODS_PCA_PCA_IMPL_HPP +#define MLPACK_METHODS_PCA_PCA_IMPL_HPP + #include +#include "pca.hpp" using namespace std; -using namespace mlpack; -using namespace mlpack::pca; -PCA::PCA(const bool scaleData) : - scaleData(scaleData) +namespace mlpack { +namespace pca { + +template +PCAType::PCAType(const bool scaleData, + const DecompositionPolicy& decomposition) : + scaleData(scaleData), + decomposition(decomposition) { } /** @@ -24,54 +34,23 @@ PCA::PCA(const bool scaleData) : * @param eigVal - contains eigen values in a column vector * @param coeff - PCA Loadings/Coeffs/EigenVectors */ -void PCA::Apply(const arma::mat& data, - arma::mat& transformedData, - arma::vec& eigVal, - arma::mat& coeff) const +template +void PCAType::Apply(const arma::mat& data, + arma::mat& transformedData, + arma::vec& eigVal, + arma::mat& coeff) { Timer::Start("pca"); - // This matrix will store the right singular values; we do not need them. - arma::mat v; - // Center the data into a temporary matrix. arma::mat centeredData; math::Center(data, centeredData); - if (scaleData) - { - // Scaling the data is when we reduce the variance of each dimension to 1. - // We do this by dividing each dimension by its standard deviation. - arma::vec stdDev = arma::stddev(centeredData, 0, 1 /* for each dimension */); - - // If there are any zeroes, make them very small. - for (size_t i = 0; i < stdDev.n_elem; ++i) - if (stdDev[i] == 0) - stdDev[i] = 1e-50; - - centeredData /= arma::repmat(stdDev, 1, centeredData.n_cols); - } - - // Do singular value decomposition. Use the economical singular value - // decomposition if the columns are much larger than the rows. - if (data.n_rows < data.n_cols) - { - // Do economical singular value decomposition and compute only the left - // singular vectors. 
- arma::svd_econ(coeff, eigVal, v, centeredData, 'l'); - } - else - { - arma::svd(coeff, eigVal, v, centeredData); - } - - // Now we must square the singular values to get the eigenvalues. - // In addition we must divide by the number of points, because the covariance - // matrix is X * X' / (N - 1). - eigVal %= eigVal / (data.n_cols - 1); + // Scale the data if the user ask for. + ScaleData(centeredData); - // Project the samples to the principals. - transformedData = arma::trans(coeff) * centeredData; + decomposition.Apply(data, centeredData, transformedData, eigVal, coeff, + data.n_rows); Timer::Stop("pca"); } @@ -83,9 +62,10 @@ void PCA::Apply(const arma::mat& data, * @param transformedData - Data with PCA applied * @param eigVal - contains eigen values in a column vector */ -void PCA::Apply(const arma::mat& data, - arma::mat& transformedData, - arma::vec& eigVal) const +template +void PCAType::Apply(const arma::mat& data, + arma::mat& transformedData, + arma::vec& eigVal) { arma::mat coeffs; Apply(data, transformedData, eigVal, coeffs); @@ -102,7 +82,9 @@ void PCA::Apply(const arma::mat& data, * @param newDimension New dimension of the data. * @return Amount of the variance of the data retained (between 0 and 1). */ -double PCA::Apply(arma::mat& data, const size_t newDimension) const +template +double PCAType::Apply(arma::mat& data, + const size_t newDimension) { // Parameter validation. if (newDimension == 0) @@ -116,7 +98,16 @@ double PCA::Apply(arma::mat& data, const size_t newDimension) const arma::mat coeffs; arma::vec eigVal; - Apply(data, data, eigVal, coeffs); + Timer::Start("pca"); + + // Center the data into a temporary matrix. + arma::mat centeredData; + math::Center(data, centeredData); + + // Scale the data if the user ask for. + ScaleData(centeredData); + + decomposition.Apply(data, centeredData, data, eigVal, coeffs, newDimension); if (newDimension < coeffs.n_rows) // Drop unnecessary rows. 
@@ -126,6 +117,8 @@ double PCA::Apply(arma::mat& data, const size_t newDimension) const // the right dimension before calculating the amount of variance retained. double eigDim = std::min(newDimension - 1, (size_t) eigVal.n_elem - 1); + Timer::Stop("pca"); + // Calculate the total amount of variance retained. return (sum(eigVal.subvec(0, eigDim)) / sum(eigVal)); } @@ -140,7 +133,9 @@ double PCA::Apply(arma::mat& data, const size_t newDimension) const * The method returns the actual amount of variance retained, which will * always be greater than or equal to the varRetained parameter. */ -double PCA::Apply(arma::mat& data, const double varRetained) const +template +double PCAType::Apply(arma::mat& data, + const double varRetained) { // Parameter validation. if (varRetained < 0) @@ -171,3 +166,8 @@ double PCA::Apply(arma::mat& data, const double varRetained) const return varSum; } + +} // namespace pca +} // namespace mlpack + +#endif \ No newline at end of file diff --git a/src/mlpack/methods/pca/pca_main.cpp b/src/mlpack/methods/pca/pca_main.cpp index 8bbbedf0fcd..907939102fe 100644 --- a/src/mlpack/methods/pca/pca_main.cpp +++ b/src/mlpack/methods/pca/pca_main.cpp @@ -1,12 +1,16 @@ /** * @file pca_main.cpp * @author Ryan Curtin + * @author Marcus Edel * * Main executable to run PCA. */ #include #include "pca.hpp" +#include +#include +#include using namespace mlpack; using namespace mlpack::pca; @@ -14,10 +18,10 @@ using namespace std; // Document program. PROGRAM_INFO("Principal Components Analysis", "This program performs principal " - "components analysis on the given dataset. It will transform the data " - "onto its principal components, optionally performing dimensionality " - "reduction by ignoring the principal components with the smallest " - "eigenvalues."); + "components analysis on the given dataset using the exact, randomized or " + "QUIC SVD method. 
It will transform the data onto its principal components," + " optionally performing dimensionality reduction by ignoring the principal " + "components with the smallest eigenvalues."); // Parameters for program. PARAM_STRING_REQ("input_file", "Input dataset to perform PCA on.", "i"); @@ -31,6 +35,40 @@ PARAM_DOUBLE("var_to_retain", "Amount of variance to retain; should be between " PARAM_FLAG("scale", "If set, the data will be scaled before running PCA, such " "that the variance of each feature is 1.", "s"); +PARAM_STRING("decomposition_method", "Method used for the principal " + "components analysis: 'exact', 'randomized', 'quic'.", "c", "exact"); + + +//! Run PCA on the specified dataset with the given decomposition method. +template<typename DecompositionPolicy> +void RunPCA(arma::mat& dataset, + const size_t newDimension, + const size_t scale, + const double varToRetain) +{ + PCAType<DecompositionPolicy> p(scale); + + Log::Info << "Performing PCA on dataset..." << endl; + double varRetained; + + if (varToRetain != 0) + { + if (newDimension != 0) + Log::Warn << "New dimensionality (-d) ignored because -V was specified." + << endl; + + varRetained = p.Apply(dataset, varToRetain); + } + else + { + varRetained = p.Apply(dataset, newDimension); + } + + Log::Info << (varRetained * 100) << "% of variance retained (" << + dataset.n_rows << " dimensions)." << endl; + +} + int main(int argc, char** argv) { // Parse commandline. @@ -57,27 +95,30 @@ int main(int argc, char** argv) // Get the options for running PCA. const size_t scale = CLI::HasParam("scale"); + const double varToRetain = CLI::GetParam<double>("var_to_retain"); + const string decompositionMethod = CLI::GetParam<string>( + "decomposition_method"); // Perform PCA. - PCA p(scale); - Log::Info << "Performing PCA on dataset..."
<< endl; - double varRetained; - if (CLI::GetParam<double>("var_to_retain") != 0) + if (decompositionMethod == "exact") { - if (CLI::GetParam<int>("new_dimensionality") != 0) - Log::Warn << "New dimensionality (-d) ignored because --var_to_retain was" - << " specified." << endl; - - varRetained = p.Apply(dataset, CLI::GetParam<double>("var_to_retain")); + RunPCA<ExactSVDPolicy>(dataset, newDimension, scale, varToRetain); + } + else if (decompositionMethod == "randomized") + { + RunPCA<RandomizedSVDPolicy>(dataset, newDimension, scale, varToRetain); + } + else if (decompositionMethod == "quic") + { + RunPCA<QUICSVDPolicy>(dataset, newDimension, scale, varToRetain); } else { - varRetained = p.Apply(dataset, newDimension); + // Invalid decomposition method. + Log::Fatal << "Invalid decomposition method ('" << decompositionMethod + << "'); valid choices are 'exact', 'randomized', 'quic'." << endl; } - Log::Info << (varRetained * 100) << "% of variance retained (" << - dataset.n_rows << " dimensions)." << endl; - // Now save the results. string outputFile = CLI::GetParam<string>("output_file"); data::Save(outputFile, dataset); diff --git a/src/mlpack/methods/quic_svd/CMakeLists.txt b/src/mlpack/methods/quic_svd/CMakeLists.txt index ae7a36adc89..ef7def2f080 100644 --- a/src/mlpack/methods/quic_svd/CMakeLists.txt +++ b/src/mlpack/methods/quic_svd/CMakeLists.txt @@ -2,7 +2,7 @@ # Anything not in this list will not be compiled into mlpack. set(SOURCES quic_svd.hpp - quic_svd_impl.hpp + quic_svd.cpp ) # Add directory name to sources. diff --git a/src/mlpack/methods/quic_svd/quic_svd_impl.hpp b/src/mlpack/methods/quic_svd/quic_svd.cpp similarity index 95% rename from src/mlpack/methods/quic_svd/quic_svd_impl.hpp rename to src/mlpack/methods/quic_svd/quic_svd.cpp index 20aaba1822d..4640d481811 100644 --- a/src/mlpack/methods/quic_svd/quic_svd_impl.hpp +++ b/src/mlpack/methods/quic_svd/quic_svd.cpp @@ -4,8 +4,6 @@ * * An implementation of QUIC-SVD.
*/ -#ifndef MLPACK_METHODS_QUIC_SVD_QUIC_SVD_IMPL_HPP -#define MLPACK_METHODS_QUIC_SVD_QUIC_SVD_IMPL_HPP // In case it hasn't been included yet. #include "quic_svd.hpp" @@ -34,6 +32,9 @@ QUIC_SVD::QUIC_SVD(const arma::mat& dataset, // Get subspace basis by creating the cosine tree. ctree->GetFinalBasis(basis); + // Delete cosine tree. + delete ctree; + // Use the ExtractSVD algorithm mentioned in the paper to extract the SVD of // the original dataset in the obtained subspace. ExtractSVD(u, v, sigma); @@ -76,5 +77,3 @@ void QUIC_SVD::ExtractSVD(arma::mat& u, } // namespace svd } // namespace mlpack - -#endif diff --git a/src/mlpack/methods/quic_svd/quic_svd.hpp b/src/mlpack/methods/quic_svd/quic_svd.hpp index 3fe9e6d2b11..d470bfe6f8f 100644 --- a/src/mlpack/methods/quic_svd/quic_svd.hpp +++ b/src/mlpack/methods/quic_svd/quic_svd.hpp @@ -77,9 +77,7 @@ class QUIC_SVD * @param v Second unitary matrix. * @param sigma Diagonal matrix of singular values. */ - void ExtractSVD(arma::mat& u, - arma::mat& v, - arma::mat& sigma); + void ExtractSVD(arma::mat& u, arma::mat& v, arma::mat& sigma); private: //! Matrix for which cosine tree is constructed. @@ -91,7 +89,4 @@ class QUIC_SVD } // namespace svd } // namespace mlpack -// Include implementation. -#include "quic_svd_impl.hpp" - #endif diff --git a/src/mlpack/methods/randomized_svd/CMakeLists.txt b/src/mlpack/methods/randomized_svd/CMakeLists.txt new file mode 100644 index 00000000000..ad0e5b34faf --- /dev/null +++ b/src/mlpack/methods/randomized_svd/CMakeLists.txt @@ -0,0 +1,15 @@ +# Define the files we need to compile. +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + randomized_svd.hpp + randomized_svd.cpp +) + +# Add directory name to sources. +set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). 
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/src/mlpack/methods/randomized_svd/randomized_svd.cpp b/src/mlpack/methods/randomized_svd/randomized_svd.cpp new file mode 100644 index 00000000000..64d62ee157a --- /dev/null +++ b/src/mlpack/methods/randomized_svd/randomized_svd.cpp @@ -0,0 +1,128 @@ +/** + * @file randomized_svd.cpp + * @author Marcus Edel + * + * Implementation of the randomized SVD method. + */ + +#include "randomized_svd.hpp" + +namespace mlpack { +namespace svd { + +RandomizedSVD::RandomizedSVD(const arma::mat& data, + arma::mat& u, + arma::vec& s, + arma::mat& v, + const size_t iteratedPower, + const size_t maxIterations, + const size_t rank) : + iteratedPower(iteratedPower), + maxIterations(maxIterations) +{ + if (rank == 0) + { + Apply(data, u, s, v, data.n_rows); + } + else + { + Apply(data, u, s, v, rank); + } +} + +RandomizedSVD::RandomizedSVD(const size_t iteratedPower, + const size_t maxIterations) : + iteratedPower(iteratedPower), + maxIterations(maxIterations) +{ + /* Nothing to do here */ +} + +void RandomizedSVD::Apply(const arma::mat& data, + arma::mat& u, + arma::vec& s, + arma::mat& v, + const size_t rank) +{ + if (iteratedPower == 0) + iteratedPower = rank + 2; + + // Center the data into a temporary matrix. + arma::vec rowMean = arma::sum(data, 1) / data.n_cols; + + arma::mat R, Q, Qdata; + ann::RandomInitialization randomInit; + + // Apply the centered data matrix to a random matrix, obtaining Q. + if (data.n_cols >= data.n_rows) + { + randomInit.Initialize(R, data.n_rows, iteratedPower); + Q = (data.t() * R) - arma::repmat(arma::trans(R.t() * rowMean), + data.n_cols, 1); + } + else + { + randomInit.Initialize(R, data.n_cols, iteratedPower); + Q = (data * R) - (rowMean * (arma::ones(1, data.n_cols) * R)); + } + + // Form a matrix Q whose columns constitute a + // well-conditioned basis for the columns of the earlier Q. 
+ if (maxIterations == 0) + { + arma::qr_econ(Q, v, Q); + } + else + { + arma::lu(Q, v, Q); + } + + // Perform normalized power iterations. + for (size_t i = 0; i < maxIterations; ++i) + { + if (data.n_cols >= data.n_rows) + { + Q = (data * Q) - rowMean * (arma::ones(1, data.n_cols) * Q); + arma::lu(Q, v, Q); + Q = (data.t() * Q) - arma::repmat(rowMean.t() * Q, data.n_cols, 1); + } + else + { + Q = (data.t() * Q) - arma::repmat(rowMean.t() * Q, data.n_cols, 1); + arma::lu(Q, v, Q); + Q = (data * Q) - (rowMean * (arma::ones(1, data.n_cols) * Q)); + } + + // Computing the LU decomposition is more efficient than computing the QR + // decomposition, so we only use in the last iteration, a pivoted QR + // decomposition which renormalizes Q, ensuring that the columns of Q are + // orthonormal. + if (i < (maxIterations - 1)) + { + arma::lu(Q, v, Q); + } + else + { + arma::qr_econ(Q, v, Q); + } + } + + // Do economical singular value decomposition and compute only the + // approximations of the left singular vectors by using the centered data + // applied to Q. + if (data.n_cols >= data.n_rows) + { + Qdata = (data * Q) - rowMean * (arma::ones(1, data.n_cols) * Q); + arma::svd_econ(u, s, v, Qdata); + v = Q * v; + } + else + { + Qdata = (Q.t() * data) - arma::repmat(Q.t() * rowMean, 1, data.n_cols); + arma::svd_econ(u, s, v, Qdata); + u = Q * u; + } +} + +} // namespace svd +} // namespace mlpack diff --git a/src/mlpack/methods/randomized_svd/randomized_svd.hpp b/src/mlpack/methods/randomized_svd/randomized_svd.hpp new file mode 100644 index 00000000000..c175fd29211 --- /dev/null +++ b/src/mlpack/methods/randomized_svd/randomized_svd.hpp @@ -0,0 +1,133 @@ +/** + * @file randomized_svd.hpp + * @author Marcus Edel + * + * An implementation of the randomized SVD method. 
+ */ + +#ifndef MLPACK_METHODS_RANDOMIZED_SVD_RANDOMIZED_SVD_HPP +#define MLPACK_METHODS_RANDOMIZED_SVD_RANDOMIZED_SVD_HPP + +#include +#include + +namespace mlpack { +namespace svd { + +/** + * Randomized SVD is a matrix factorization that is based on randomized matrix + * approximation techniques, developed in in "Finding structure with randomness: + * Probabilistic algorithms for constructing approximate matrix decompositions". + * + * For more information, see the following. + * + * @code + * @article{Halko2011, + * author = {Halko, N. and Martinsson, P. G. and Tropp, J. A.}, + * title = {Finding Structure with Randomness: Probabilistic Algorithms for + Constructing Approximate Matrix Decompositions}, + * journal = {SIAM Rev.}, + * volume = {53}, + * year = {2011}, + * } + * @endcode + * + * @code + * @article{Szlam2014, + * author = {Arthur Szlam Yuval Kluger and Mark Tygert}, + * title = {An implementation of a randomized algorithm for principal + component analysis}, + * journal = {CoRR}, + * volume = {abs/1412.3510}, + * year = {2014}, + * } + * @endcode + * + * An example of how to use the interface is shown below: + * + * @code + * arma::mat data; // Rating data in the form of coordinate list. + * + * const size_t rank = 20; // Rank used for the decomposition. + * + * // Make a RandomizedSVD object. + * RandomizedSVD rSVD(); + * + * arma::mat u, s, v; + * + * // Use the Apply() method to get a factorization. + * rSVD.Apply(data, u, s, v, rank); + * @endcode + */ +class RandomizedSVD +{ + public: + /** + * Create object for the randomized SVD method. + * + * @param data Data matrix. + * @param u First unitary matrix. + * @param v Second unitary matrix. + * @param sigma Diagonal matrix of singular values. + * @param iteratedPower Size of the normalized power iterations + * (Default: rank + 2). + * @param maxIterations Number of iterations for the power method + * (Default: 2). + * @param rank Rank of the approximation (Default: number of rows.) 
+ */ + RandomizedSVD(const arma::mat& data, + arma::mat& u, + arma::vec& s, + arma::mat& v, + const size_t iteratedPower = 0, + const size_t maxIterations = 2, + const size_t rank = 0); + + /** + * Create object for the randomized SVD method. + * + * @param iteratedPower Size of the normalized power iterations + * (Default: rank + 2). + * @param maxIterations Number of iterations for the power method + * (Default: 2). + */ + RandomizedSVD(const size_t iteratedPower = 0, const size_t maxIterations = 2); + + /** + * Apply Principal Component Analysis to the provided data set using the + * randomized SVD. + * + * @param data Data matrix. + * @param u First unitary matrix. + * @param v Second unitary matrix. + * @param s Diagonal matrix of singular values. + * @param rank Rank of the approximation. + */ + void Apply(const arma::mat& data, + arma::mat& u, + arma::vec& s, + arma::mat& v, + const size_t rank); + + //! Get the size of the normalized power iterations. + size_t IteratedPower() const { return iteratedPower; } + //! Modify the size of the normalized power iterations. + size_t& IteratedPower() { return iteratedPower; } + + //! Get the number of iterations for the power method. + size_t MaxIterations() const { return maxIterations; } + //! Modify the number of iterations for the power method. + size_t& MaxIterations() { return maxIterations; } + + private: + //! Locally stored size of the normalized power iterations. + size_t iteratedPower; + + //! Locally stored number of iterations for the power method.
+ size_t maxIterations; +}; + +} // namespace svd +} // namespace mlpack + +#endif diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 967edeee4e1..b13494dff09 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -57,7 +57,9 @@ add_executable(mlpack_test perceptron_test.cpp quic_svd_test.cpp radical_test.cpp + randomized_svd_test.cpp range_search_test.cpp + recurrent_network_test.cpp rectangle_tree_test.cpp regularized_svd_test.cpp rmsprop_test.cpp @@ -80,7 +82,6 @@ add_executable(mlpack_test svd_incremental_test.cpp nystroem_method_test.cpp armadillo_svd_test.cpp - recurrent_network_test.cpp ) # Link dependencies of test executable. target_link_libraries(mlpack_test diff --git a/src/mlpack/tests/pca_test.cpp b/src/mlpack/tests/pca_test.cpp index e4f3b6a311e..b8bc88ac9dc 100644 --- a/src/mlpack/tests/pca_test.cpp +++ b/src/mlpack/tests/pca_test.cpp @@ -1,37 +1,41 @@ /** * @file pca_test.cpp * @author Ajinkya Kale + * @author Marcus Edel * * Test file for PCA class. */ #include #include +#include +#include +#include #include #include "test_tools.hpp" BOOST_AUTO_TEST_SUITE(PCATest); -using namespace std; using namespace arma; using namespace mlpack; using namespace mlpack::pca; using namespace mlpack::distribution; -/** - * Compare the output of our PCA implementation with Armadillo's. +/* + * Compare the output of our PCA implementation with Armadillo's using the + * specified decomposition policy.
*/ -BOOST_AUTO_TEST_CASE(ArmaComparisonPCATest) +template +void ArmaComparisonPCA() { - mat coeff, coeff1; - vec eigVal, eigVal1; - mat score, score1; + arma::mat coeff, coeff1, score, score1; + arma::vec eigVal, eigVal1; - mat data = randu(100, 100); + arma::mat data = arma::randu(3, 1000); - PCA p; + PCAType exactPCA; + exactPCA.Apply(data, score1, eigVal1, coeff1); - p.Apply(data, score1, eigVal1, coeff1); princomp(coeff, score, eigVal, trans(data)); // Verify the PCA results based on the eigenvalues. @@ -44,11 +48,12 @@ BOOST_AUTO_TEST_CASE(ArmaComparisonPCATest) } } -/** +/* * Test that dimensionality reduction with PCA works the same way MATLAB does - * (which should be correct!). + * (which should be correct!) using the specified decomposition policy. */ -BOOST_AUTO_TEST_CASE(PCADimensionalityReductionTest) +template +void PCADimensionalityReduction() { // Fake, simple dataset. The results we will compare against are from MATLAB. mat data("1 0 2 3 9;" @@ -56,7 +61,7 @@ BOOST_AUTO_TEST_CASE(PCADimensionalityReductionTest) "6 7 3 1 8"); // Now run PCA to reduce the dimensionality. - PCA p; + PCAType p; const double varRetained = p.Apply(data, 2); // Reduce to 2 dimensions. // Compare with correct results. @@ -87,11 +92,12 @@ BOOST_AUTO_TEST_CASE(PCADimensionalityReductionTest) /** * Test that setting the variance retained parameter to perform dimensionality - * reduction works. + * reduction works using the specified decomposition policy. */ -BOOST_AUTO_TEST_CASE(PCAVarianceRetainedTest) +template +void PCAVarianceRetained() { - // Fake, simple dataset. + // Fake, simple dataset. mat data("1 0 2 3 9;" "5 2 8 4 8;" "6 7 3 1 8"); @@ -105,7 +111,7 @@ BOOST_AUTO_TEST_CASE(PCAVarianceRetainedTest) // and if we keep two, the actual variance retained is // 0.904876047045906 // and if we keep three, the actual variance retained is 1. 
- PCA p; + PCAType p; arma::mat origData = data; double varRetained = p.Apply(data, 0.1); @@ -149,6 +155,80 @@ BOOST_AUTO_TEST_CASE(PCAVarianceRetainedTest) BOOST_REQUIRE_CLOSE(varRetained, 1.0, 1e-5); } +/** + * Compare the output of our exact PCA implementation with Armadillo's. + */ +BOOST_AUTO_TEST_CASE(ArmaComparisonExactPCATest) +{ + ArmaComparisonPCA(); +} + +/** + * Compare the output of our randomized-SVD PCA implementation with Armadillo's. + */ +BOOST_AUTO_TEST_CASE(ArmaComparisonRandomizedPCATest) +{ + ArmaComparisonPCA(); +} + +/** + * Test that dimensionality reduction with exact-svd PCA works the same way + * MATLAB does (which should be correct!). + */ +BOOST_AUTO_TEST_CASE(ExactPCADimensionalityReductionTest) +{ + PCADimensionalityReduction(); +} + +/** + * Test that dimensionality reduction with randomized-svd PCA works the same way + * MATLAB does (which should be correct!). + */ +BOOST_AUTO_TEST_CASE(RandomizedPCADimensionalityReductionTest) +{ + PCADimensionalityReduction(); +} + +/** + * Test that dimensionality reduction with QUIC-SVD PCA works the same way + * as the Exact-SVD PCA method. + */ +BOOST_AUTO_TEST_CASE(QUICPCADimensionalityReductionTest) +{ + arma::mat data, data1; + data::Load("test_data_3_1000.csv", data); + data1 = data; + + PCAType exactPCA; + const double varRetainedExact = exactPCA.Apply(data, 1); + + PCAType quicPCA; + const double varRetainedQUIC = quicPCA.Apply(data1, 1); + + BOOST_REQUIRE_CLOSE(varRetainedExact, varRetainedQUIC, 4.0); + + BOOST_REQUIRE_EQUAL(data.n_rows, data1.n_rows); + BOOST_REQUIRE_EQUAL(data.n_cols, data1.n_cols); +} + +/** + * Test that setting the variance retained parameter to perform dimensionality + * reduction works using the exact svd PCA method. + */ +BOOST_AUTO_TEST_CASE(ExactPCAVarianceRetainedTest) +{ + PCAVarianceRetained(); +} + +/** + * Test that setting the variance retained parameter to perform dimensionality + * reduction works using the randomized svd PCA method. 
+ */ +BOOST_AUTO_TEST_CASE(RandomizedPCAVarianceRetainedTest) +{ + PCAVarianceRetained(); +} + /** * Test that scaling PCA works. */ diff --git a/src/mlpack/tests/quic_svd_test.cpp b/src/mlpack/tests/quic_svd_test.cpp index 218bfff4c39..871cf813853 100644 --- a/src/mlpack/tests/quic_svd_test.cpp +++ b/src/mlpack/tests/quic_svd_test.cpp @@ -14,7 +14,6 @@ BOOST_AUTO_TEST_SUITE(QUICSVDTest); using namespace mlpack; -using namespace mlpack::svd; /** * The reconstruction error of the obtained SVD should be small. @@ -27,7 +26,7 @@ BOOST_AUTO_TEST_CASE(QUICSVDReconstructionError) // Obtain the SVD using default parameters. arma::mat u, v, sigma; - QUIC_SVD quicsvd(dataset, u, v, sigma); + svd::QUIC_SVD quicsvd(dataset, u, v, sigma); // Reconstruct the matrix using the SVD. arma::mat reconstruct; @@ -39,4 +38,35 @@ BOOST_AUTO_TEST_CASE(QUICSVDReconstructionError) BOOST_REQUIRE_SMALL(relativeError, 1e-5); } +/** + * The singular value error of the obtained SVD should be small. + */ +BOOST_AUTO_TEST_CASE(QUICSVDSigularValueError) +{ + arma::mat U = arma::randn(3, 20); + arma::mat V = arma::randn(10, 3); + + arma::mat R; + arma::qr_econ(U, R, U); + arma::qr_econ(V, R, V); + + arma::mat s = arma::diagmat(arma::vec("1 0.1 0.01")); + + arma::mat data = arma::trans(U * arma::diagmat(s) * V.t()); + + arma::vec s1, s3; + arma::mat U1, U2, V1, V2, s2; + + // Obtain the SVD using default parameters. + arma::svd_econ(U1, s1, V1, data); + svd::QUIC_SVD quicsvd(data, U1, V1, s2); + + s3 = arma::diagvec(s2); + s1 = s1.subvec(0, s3.n_elem - 1); + + // The singular value error should be small.
+ double error = arma::norm(s1 - s3); + BOOST_REQUIRE_SMALL(error, 0.01); +} + BOOST_AUTO_TEST_SUITE_END(); diff --git a/src/mlpack/tests/randomized_svd_test.cpp b/src/mlpack/tests/randomized_svd_test.cpp new file mode 100644 index 00000000000..a911279c525 --- /dev/null +++ b/src/mlpack/tests/randomized_svd_test.cpp @@ -0,0 +1,62 @@ +/** + * @file randomized_svd_test.cpp + * @author Marcus Edel + * + * Test file for the Randomized SVD class. + */ + +#include +#include + +#include +#include "test_tools.hpp" + +BOOST_AUTO_TEST_SUITE(RandomizedSVDTest); + +using namespace mlpack; + +/** + * The reconstruction and singular value error of the obtained SVD should be + * small. + */ +BOOST_AUTO_TEST_CASE(RandomizedSVDReconstructionError) +{ + arma::mat U = arma::randn(3, 20); + arma::mat V = arma::randn(10, 3); + + arma::mat R; + arma::qr_econ(U, R, U); + arma::qr_econ(V, R, V); + + arma::mat s = arma::diagmat(arma::vec("1 0.1 0.01")); + + arma::mat data = arma::trans(U * arma::diagmat(s) * V.t()); + + // Center the data into a temporary matrix. + arma::mat centeredData; + math::Center(data, centeredData); + + arma::mat U1, U2, V1, V2; + arma::vec s1, s2, s3; + + arma::svd_econ(U1, s1, V1, centeredData); + + svd::RandomizedSVD rSVD(0, 6); + rSVD.Apply(data, U2, s2, V2, 3); + + // Use the same amount of data for the comparison (matrix rank). + s3 = s1.subvec(0, s2.n_elem - 1); + + // The singular value error should be small. + double error = arma::norm(s2 - s3, "frob"); + BOOST_REQUIRE_SMALL(error, 1e-5); + + arma::mat reconstruct = U2 * arma::diagmat(s2) * V2.t(); + + // The relative reconstruction error should be small. + error = arma::norm(centeredData - reconstruct, "frob") / + arma::norm(centeredData, "frob"); + BOOST_REQUIRE_SMALL(error, 1e-5); +} + +BOOST_AUTO_TEST_SUITE_END();