From 84990cb1525b7abdf0869b0495891cbb1bdef998 Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 6 Oct 2016 12:41:37 +0300 Subject: [PATCH 01/25] - Mac gitignore changes. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index fbdd115098b..6a6e0b36ecd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ build* +xcode* +.DS_Store src/mlpack/core/util/gitversion.hpp src/mlpack/core/util/arma_config.hpp From 5b13881795d4741ef736854b63b7a413ef48a2c6 Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 13 Oct 2016 16:10:43 +0300 Subject: [PATCH 02/25] - DTree class templated. --- src/mlpack/methods/det/CMakeLists.txt | 2 +- src/mlpack/methods/det/dtree.hpp | 98 ++++------ .../methods/det/{dtree.cpp => dtree_impl.hpp} | 178 ++++++++++-------- 3 files changed, 141 insertions(+), 137 deletions(-) rename src/mlpack/methods/det/{dtree.cpp => dtree_impl.hpp} (77%) diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt index 73b0474eeb3..66edbe6d620 100644 --- a/src/mlpack/methods/det/CMakeLists.txt +++ b/src/mlpack/methods/det/CMakeLists.txt @@ -4,7 +4,7 @@ set(SOURCES # the DET class dtree.hpp - dtree.cpp + dtree_impl.hpp # the util file dt_utils.hpp diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 8e8bbdfe965..f39b5bf36fa 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -36,9 +36,17 @@ namespace det /** Density Estimation Trees */ { * } * @endcode */ +template class DTree { public: + /** + * The actual, underlying type we're working with + */ + typedef typename MatType::elem_type ElemType; + /** * Create an empty density estimation tree. */ @@ -52,8 +60,8 @@ class DTree * @param minVals Minimum values of the bounding box. * @param totalPoints Total number of points in the dataset. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t totalPoints); /** @@ -64,7 +72,7 @@ class DTree * * @param data Dataset to build tree on. */ - DTree(arma::mat& data); + DTree(MatType& data); /** * Create a child node of a density estimation tree given the bounding box @@ -78,8 +86,8 @@ class DTree * @param end End of points represented by this node in the data matrix. * @param error log-negative error of this node. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t start, const size_t end, const double logNegError); @@ -95,8 +103,8 @@ class DTree * @param start Start of points represented by this node in the data matrix. * @param end End of points represented by this node in the data matrix. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t totalPoints, const size_t start, const size_t end); @@ -114,7 +122,7 @@ class DTree * @param maxLeafSize Maximum size of a leaf. * @param minLeafSize Minimum size of a leaf. */ - double Grow(arma::mat& data, + double Grow(MatType& data, arma::Col& oldFromNew, const bool useVolReg = false, const size_t maxLeafSize = 10, @@ -137,16 +145,7 @@ class DTree * * @param query Point to estimate density of. */ - double ComputeValue(const arma::vec& query) const; - - /** - * Print the tree in a depth-first manner (this function is called - * recursively). - * - * @param fp File to write the tree to. - * @param level Level of the tree (should start at 0). - */ - void WriteTree(FILE *fp, const size_t level = 0) const; + double ComputeValue(const VecType& query) const; /** * Index the buckets for possible usage later; this results in every leaf in @@ -155,7 +154,7 @@ class DTree * * @param tag Tag for the next leaf; leave at 0 for the initial call. */ - int TagTree(const int tag = 0); + TagType TagTree(const TagType tag = 0); /** * Return the tag of the leaf containing the query. This is useful for @@ -163,7 +162,7 @@ class DTree * * @param query Query to search for. */ - int FindBucket(const arma::vec& query) const; + TagType FindBucket(const VecType& query) const; /** * Compute the variable importance of each dimension in the learned tree. @@ -183,7 +182,7 @@ class DTree /** * Return whether a query point is within the range of this node. */ - bool WithinRange(const arma::vec& query) const; + bool WithinRange(const VecType& query) const; private: // The indices in the complete set of points @@ -208,7 +207,7 @@ class DTree size_t splitDim; //! The split value on the splitting dimension for this node. - double splitValue; + ElemType splitValue; //! log-negative-L2-error of the node. double logNegError; @@ -229,7 +228,7 @@ class DTree double logVolume; //! The tag for the leaf, used for hashing points. - int bucketTag; + TagType bucketTag; //! Upper part of alpha sum; used for pruning. double alphaUpper; @@ -247,7 +246,7 @@ class DTree //! Return the split dimension of this node. size_t SplitDim() const { return splitDim; } //! Return the split value of this node. - double SplitValue() const { return splitValue; } + ElemType SplitValue() const { return splitValue; } //! Return the log negative error of this node. double LogNegError() const { return logNegError; } //! Return the log negative error of all descendants of this node. @@ -267,51 +266,24 @@ class DTree bool Root() const { return root; } //! Return the upper part of the alpha sum. double AlphaUpper() const { return alphaUpper; } + //! Return the current bucket's ID, if leaf, or -1 otherwise + TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; } //! Return the maximum values. - const arma::vec& MaxVals() const { return maxVals; } + const VecType& MaxVals() const { return maxVals; } //! Modify the maximum values. - arma::vec& MaxVals() { return maxVals; } + VecType& MaxVals() { return maxVals; } //! Return the minimum values. - const arma::vec& MinVals() const { return minVals; } + const VecType& MinVals() const { return minVals; } //! Modify the minimum values. - arma::vec& MinVals() { return minVals; } + VecType& MinVals() { return minVals; } /** * Serialize the density estimation tree. */ template - void Serialize(Archive& ar, const unsigned int /* version */) - { - using data::CreateNVP; - - ar & CreateNVP(start, "start"); - ar & CreateNVP(end, "end"); - ar & CreateNVP(maxVals, "maxVals"); - ar & CreateNVP(minVals, "minVals"); - ar & CreateNVP(splitDim, "splitDim"); - ar & CreateNVP(splitValue, "splitValue"); - ar & CreateNVP(logNegError, "logNegError"); - ar & CreateNVP(subtreeLeavesLogNegError, "subtreeLeavesLogNegError"); - ar & CreateNVP(subtreeLeaves, "subtreeLeaves"); - ar & CreateNVP(root, "root"); - ar & CreateNVP(ratio, "ratio"); - ar & CreateNVP(logVolume, "logVolume"); - ar & CreateNVP(bucketTag, "bucketTag"); - ar & CreateNVP(alphaUpper, "alphaUpper"); - - if (Archive::is_loading::value) - { - if (left) - delete left; - if (right) - delete right; - } - - ar & CreateNVP(left, "left"); - ar & CreateNVP(right, "right"); - } + void Serialize(Archive& ar, const unsigned int /* version */); private: @@ -320,9 +292,9 @@ class DTree /** * Find the dimension to split on. */ - bool FindSplit(const arma::mat& data, + bool FindSplit(const MatType& data, size_t& splitDim, - double& splitValue, + ElemType& splitValue, double& leftError, double& rightError, const size_t minLeafSize = 5) const; @@ -330,9 +302,9 @@ class DTree /** * Split the data, returning the number of points left of the split. */ - size_t SplitData(arma::mat& data, + size_t SplitData(MatType& data, const size_t splitDim, - const double splitValue, + const ElemType splitValue, arma::Col& oldFromNew) const; }; @@ -340,4 +312,6 @@ class DTree } // namespace det } // namespace mlpack +#include "dtree_impl.hpp" + #endif // MLPACK_METHODS_DET_DTREE_HPP diff --git a/src/mlpack/methods/det/dtree.cpp b/src/mlpack/methods/det/dtree_impl.hpp similarity index 77% rename from src/mlpack/methods/det/dtree.cpp rename to src/mlpack/methods/det/dtree_impl.hpp index c88d480b704..5ef37587e95 100644 --- a/src/mlpack/methods/det/dtree.cpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -12,7 +12,8 @@ using namespace mlpack; using namespace det; -DTree::DTree() : +template +DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), @@ -31,9 +32,11 @@ DTree::DTree() : // Root node initializers -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t totalPoints) : + +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints) : start(0), end(totalPoints), maxVals(maxVals), @@ -52,7 +55,8 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::DTree(arma::mat& data) : +template +DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), @@ -88,11 +92,12 @@ DTree::DTree(arma::mat& data) : // Non-root node initializers -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t start, - const size_t end, - const double logNegError) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t start, + const size_t end, + const double logNegError) : start(start), end(end), maxVals(maxVals), @@ -111,11 +116,12 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t totalPoints, - const size_t start, - const size_t end) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints, + const size_t start, + const size_t end) : start(start), end(end), maxVals(maxVals), @@ -134,7 +140,8 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::~DTree() +template +DTree::~DTree() { delete left; delete right; @@ -142,7 +149,8 @@ DTree::~DTree() // This function computes the log-l2-negative-error of a given node from the // formula R(t) = log(|t|^2 / (N^2 V_t)). -double DTree::LogNegativeError(const size_t totalPoints) const +template +double DTree::LogNegativeError(const size_t totalPoints) const { // log(-|t|^2 / (N^2 V_t)) = log(-1) + 2 log(|t|) - 2 log(N) - log(V_t). double err = 2 * std::log((double) (end - start)) - @@ -162,12 +170,13 @@ double DTree::LogNegativeError(const size_t totalPoints) const // This function finds the best split with respect to the L2-error, by trying // all possible splits. The dataset is the full data set but the start and // end are used to obtain the point in this node. -bool DTree::FindSplit(const arma::mat& data, - size_t& splitDim, - double& splitValue, - double& leftError, - double& rightError, - const size_t minLeafSize) const +template +bool DTree::FindSplit(const MatType& data, + size_t& splitDim, + ElemType& splitValue, + double& leftError, + double& rightError, + const size_t minLeafSize) const { // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. @@ -180,12 +189,20 @@ bool DTree::FindSplit(const arma::mat& data, bool splitFound = false; // Loop through each dimension. - for (size_t dim = 0; dim < maxVals.n_elem; dim++) +#ifdef _WIN32 + #pragma omp parallel for default(none) \ + shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + for (intmax_t dim = 0; fold < (intmax_t) maxVals.n_elem; ++dim) +#else + #pragma omp parallel for default(none) \ + shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + for (size_t dim = 0; dim < maxVals.n_elem; ++dim) +#endif { // Have to deal with REAL, INTEGER, NOMINAL data differently, so we have to // think of how to do that... - const double min = minVals[dim]; - const double max = maxVals[dim]; + const ElemType min = minVals[dim]; + const ElemType max = maxVals[dim]; // If there is nothing to split in this dimension, move on. if (max - min == 0.0) @@ -197,7 +214,7 @@ bool DTree::FindSplit(const arma::mat& data, double minDimError = std::pow(points, 2.0) / (max - min); double dimLeftError = 0.0; // For -Wuninitialized. These variables will double dimRightError = 0.0; // always be set to something else before use. - double dimSplitValue = 0.0; + ElemType dimSplitValue = 0.0; // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); @@ -214,7 +231,7 @@ bool DTree::FindSplit(const arma::mat& data, { // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. - const double split = (dimVec[i] + dimVec[i + 1]) / 2.0; + const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; if (split == dimVec[i]) continue; // We can't split here (two points are the same). @@ -269,10 +286,11 @@ bool DTree::FindSplit(const arma::mat& data, return splitFound; } -size_t DTree::SplitData(arma::mat& data, - const size_t splitDim, - const double splitValue, - arma::Col& oldFromNew) const +template +size_t DTree::SplitData(MatType& data, + const size_t splitDim, + const double splitValue, + arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim // less than or equal to splitValue are on the left side, and all others are @@ -303,11 +321,12 @@ size_t DTree::SplitData(arma::mat& data, } // Greedily expand the tree -double DTree::Grow(arma::mat& data, - arma::Col& oldFromNew, - const bool useVolReg, - const size_t maxLeafSize, - const size_t minLeafSize) +template +double DTree::Grow(MatType& data, + arma::Col& oldFromNew, + const bool useVolReg, + const size_t maxLeafSize, + const size_t minLeafSize) { Log::Assert(data.n_rows == maxVals.n_elem); Log::Assert(data.n_rows == minVals.n_elem); @@ -450,10 +469,10 @@ double DTree::Grow(arma::mat& data, } -double DTree::PruneAndUpdate(const double oldAlpha, - const size_t points, - const bool useVolReg) - +template +double DTree::PruneAndUpdate(const double oldAlpha, + const size_t points, + const bool useVolReg) { // Compute gT. if (subtreeLeaves == 1) // If we are a leaf... @@ -565,7 +584,8 @@ double DTree::PruneAndUpdate(const double oldAlpha, // // Future improvement: Open up the range with epsilons on both sides where // epsilon depends on the density near the boundary. -bool DTree::WithinRange(const arma::vec& query) const +template +bool DTree::WithinRange(const VecType& query) const { for (size_t i = 0; i < query.n_elem; ++i) if ((query[i] < minVals[i]) || (query[i] > maxVals[i])) @@ -575,7 +595,8 @@ bool DTree::WithinRange(const arma::vec& query) const } -double DTree::ComputeValue(const arma::vec& query) const +template +double DTree::ComputeValue(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -607,35 +628,9 @@ double DTree::ComputeValue(const arma::vec& query) const } -void DTree::WriteTree(FILE *fp, const size_t level) const -{ - if (subtreeLeaves > 1) - { - fprintf(fp, "\n"); - for (size_t i = 0; i < level; ++i) - fprintf(fp, "|\t"); - fprintf(fp, "Var. %zu > %lg", splitDim, splitValue); - - right->WriteTree(fp, level + 1); - - fprintf(fp, "\n"); - for (size_t i = 0; i < level; ++i) - fprintf(fp, "|\t"); - fprintf(fp, "Var. %zu <= %lg ", splitDim, splitValue); - - left->WriteTree(fp, level); - } - else // If we are a leaf... - { - fprintf(fp, ": f(x)=%lg", std::exp(std::log(ratio) - logVolume)); - if (bucketTag != -1) - fprintf(fp, " BT:%d", bucketTag); - } -} - - // Index the buckets for possible usage later. -int DTree::TagTree(const int tag) +template +TagType DTree::TagTree(const TagType tag) { if (subtreeLeaves == 1) { @@ -650,7 +645,8 @@ int DTree::TagTree(const int tag) } -int DTree::FindBucket(const arma::vec& query) const +template +TagType DTree::FindBucket(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -670,8 +666,8 @@ int DTree::FindBucket(const arma::vec& query) const } } - -void DTree::ComputeVariableImportance(arma::vec& importances) const +template +void DTree::ComputeVariableImportance(arma::vec& importances) const { // Clear and set to right size. importances.zeros(maxVals.n_elem); @@ -697,3 +693,37 @@ void DTree::ComputeVariableImportance(arma::vec& importances) const nodes.push(curNode.Right()); } } + +template +template +void DTree::Serialize(Archive& ar, const unsigned int /* version */) +{ + using data::CreateNVP; + + ar & CreateNVP(start, "start"); + ar & CreateNVP(end, "end"); + ar & CreateNVP(maxVals, "maxVals"); + ar & CreateNVP(minVals, "minVals"); + ar & CreateNVP(splitDim, "splitDim"); + ar & CreateNVP(splitValue, "splitValue"); + ar & CreateNVP(logNegError, "logNegError"); + ar & CreateNVP(subtreeLeavesLogNegError, "subtreeLeavesLogNegError"); + ar & CreateNVP(subtreeLeaves, "subtreeLeaves"); + ar & CreateNVP(root, "root"); + ar & CreateNVP(ratio, "ratio"); + ar & CreateNVP(logVolume, "logVolume"); + ar & CreateNVP(bucketTag, "bucketTag"); + ar & CreateNVP(alphaUpper, "alphaUpper"); + + if (Archive::is_loading::value) + { + if (left) + delete left; + if (right) + delete right; + } + + ar & CreateNVP(left, "left"); + ar & CreateNVP(right, "right"); +} + From 8b4c90729c8a787fc528b0e66aedb5c864e4cf0d Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 13 Oct 2016 19:53:50 +0300 Subject: [PATCH 03/25] - First successfull builtd. --- src/mlpack/methods/det/CMakeLists.txt | 4 - src/mlpack/methods/det/det_main.cpp | 4 +- src/mlpack/methods/det/dt_utils.hpp | 23 ++- .../det/{dt_utils.cpp => dt_utils_impl.hpp} | 59 +++--- src/mlpack/methods/det/dtree.hpp | 8 +- src/mlpack/methods/det/dtree_impl.hpp | 171 +++++++++++------- 6 files changed, 153 insertions(+), 116 deletions(-) rename src/mlpack/methods/det/{dt_utils.cpp => dt_utils_impl.hpp} (83%) diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt index 66edbe6d620..4dd3bc32adb 100644 --- a/src/mlpack/methods/det/CMakeLists.txt +++ b/src/mlpack/methods/det/CMakeLists.txt @@ -5,10 +5,6 @@ set(SOURCES # the DET class dtree.hpp dtree_impl.hpp - - # the util file - dt_utils.hpp - dt_utils.cpp ) # add directory name to sources diff --git a/src/mlpack/methods/det/det_main.cpp b/src/mlpack/methods/det/det_main.cpp index f1b33bac5c5..26b394d7802 100644 --- a/src/mlpack/methods/det/det_main.cpp +++ b/src/mlpack/methods/det/det_main.cpp @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) << "(-T) is not specified." << endl; // Are we training a DET or loading from file? - DTree* tree; + DTree* tree; if (CLI::HasParam("training_file")) { const string trainSetFile = CLI::GetParam("training_file"); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) // Obtain the optimal tree. Timer::Start("det_training"); - tree = Trainer(trainingData, folds, regularization, maxLeafSize, + tree = Trainer(trainingData, folds, regularization, maxLeafSize, minLeafSize, ""); Timer::Stop("det_training"); diff --git a/src/mlpack/methods/det/dt_utils.hpp b/src/mlpack/methods/det/dt_utils.hpp index c20f78adac6..067e6fe31d7 100644 --- a/src/mlpack/methods/det/dt_utils.hpp +++ b/src/mlpack/methods/det/dt_utils.hpp @@ -25,8 +25,9 @@ namespace det { * @param numClasses Number of classes in dataset. * @param leafClassMembershipFile Name of file to print to (optional). */ -void PrintLeafMembership(DTree* dtree, - const arma::mat& data, +template +void PrintLeafMembership(DTree* dtree, + const MatType& data, const arma::Mat& labels, const size_t numClasses, const std::string leafClassMembershipFile = ""); @@ -39,7 +40,8 @@ void PrintLeafMembership(DTree* dtree, * @param dtree Density tree to use. * @param viFile Name of file to print to (optional). */ -void PrintVariableImportance(const DTree* dtree, +template +void PrintVariableImportance(const DTree* dtree, const std::string viFile = ""); /** @@ -54,14 +56,17 @@ void PrintVariableImportance(const DTree* dtree, * @param minLeafSize Minimum number of points allowed in a leaf. * @param unprunedTreeOutput Filename to print unpruned tree to (optional). */ -DTree* Trainer(arma::mat& dataset, - const size_t folds, - const bool useVolumeReg = false, - const size_t maxLeafSize = 10, - const size_t minLeafSize = 5, - const std::string unprunedTreeOutput = ""); +template +DTree* Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg = false, + const size_t maxLeafSize = 10, + const size_t minLeafSize = 5, + const std::string unprunedTreeOutput = ""); } // namespace det } // namespace mlpack +#include "dt_utils_impl.hpp" + #endif // MLPACK_METHODS_DET_DT_UTILS_HPP diff --git a/src/mlpack/methods/det/dt_utils.cpp b/src/mlpack/methods/det/dt_utils_impl.hpp similarity index 83% rename from src/mlpack/methods/det/dt_utils.cpp rename to src/mlpack/methods/det/dt_utils_impl.hpp index b8946a92d85..4c057f76dc9 100644 --- a/src/mlpack/methods/det/dt_utils.cpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -10,22 +10,23 @@ using namespace mlpack; using namespace det; -void mlpack::det::PrintLeafMembership(DTree* dtree, - const arma::mat& data, +template +void mlpack::det::PrintLeafMembership(DTree* dtree, + const MatType& data, const arma::Mat& labels, const size_t numClasses, const std::string leafClassMembershipFile) { // Tag the leaves with numbers. - int numLeaves = dtree->TagTree(); + TagType numLeaves = dtree->TagTree(); arma::Mat table(numLeaves, (numClasses + 1)); table.zeros(); for (size_t i = 0; i < data.n_cols; i++) { - const arma::vec testPoint = data.unsafe_col(i); - const int leafTag = dtree->FindBucket(testPoint); + const VecType testPoint = data.unsafe_col(i); + const TagType leafTag = dtree->FindBucket(testPoint); const size_t label = labels[i]; table(leafTag, label) += 1; } @@ -57,8 +58,8 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, return; } - -void mlpack::det::PrintVariableImportance(const DTree* dtree, +template +void mlpack::det::PrintVariableImportance(const DTree* dtree, const std::string viFile) { arma::vec imps; @@ -96,15 +97,16 @@ void mlpack::det::PrintVariableImportance(const DTree* dtree, // This function trains the optimal decision tree using the given number of // folds. -DTree* mlpack::det::Trainer(arma::mat& dataset, - const size_t folds, - const bool useVolumeReg, - const size_t maxLeafSize, - const size_t minLeafSize, - const std::string unprunedTreeOutput) +template +DTree* mlpack::det::Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg, + const size_t maxLeafSize, + const size_t minLeafSize, + const std::string unprunedTreeOutput) { // Initialize the tree. - DTree dtree(dataset); + DTree dtree(dataset); // Prepare to grow the tree... arma::Col oldFromNew(dataset.n_cols); @@ -112,7 +114,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, oldFromNew[i] = i; // Save the dataset since it would be modified while growing the tree. - arma::mat newDataset(dataset); + MatType newDataset(dataset); // Growing the tree double oldAlpha = 0.0; @@ -148,8 +150,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, std::vector > prunedSequence; while (dtree.SubtreeLeaves() > 1) { - std::pair treeSeq(oldAlpha, - dtree.SubtreeLeavesLogNegError()); + std::pair treeSeq(oldAlpha, dtree.SubtreeLeavesLogNegError()); prunedSequence.push_back(treeSeq); oldAlpha = alpha; alpha = dtree.PruneAndUpdate(oldAlpha, dataset.n_cols, useVolumeReg); @@ -157,20 +158,18 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, // Some sanity checks. It seems that on some datasets, the error does not // increase as the tree is pruned but instead stays the same---hence the // "<=" in the final assert. - Log::Assert((alpha < std::numeric_limits::max()) || - (dtree.SubtreeLeaves() == 1)); + Log::Assert((alpha < std::numeric_limits::max()) || (dtree.SubtreeLeaves() == 1)); Log::Assert(alpha > oldAlpha); Log::Assert(dtree.SubtreeLeavesLogNegError() <= treeSeq.second); } - std::pair treeSeq(oldAlpha, - dtree.SubtreeLeavesLogNegError()); + std::pair treeSeq(oldAlpha, dtree.SubtreeLeavesLogNegError()); prunedSequence.push_back(treeSeq); Log::Info << prunedSequence.size() << " trees in the sequence; maximum alpha:" << " " << oldAlpha << "." << std::endl; - arma::mat cvData(dataset); + MatType cvData(dataset); size_t testSize = dataset.n_cols / folds; arma::vec regularizationConstants(prunedSequence.size()); @@ -194,8 +193,8 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, size_t start = fold * testSize; size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); - arma::mat test = cvData.cols(start, end - 1); - arma::mat train(cvData.n_rows, cvData.n_cols - test.n_cols); + MatType test = cvData.cols(start, end - 1); + MatType train(cvData.n_rows, cvData.n_cols - test.n_cols); if (start == 0 && end < cvData.n_cols) { @@ -212,7 +211,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, } // Initialize the tree. - DTree cvDTree(train); + DTree cvDTree(train); // Getting ready to grow the tree... arma::Col cvOldFromNew(train.n_cols); @@ -252,13 +251,12 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, double cvVal = 0.0; for (size_t i = 0; i < test.n_cols; ++i) { - arma::vec testPoint = test.unsafe_col(i); + VecType testPoint = test.unsafe_col(i); cvVal += cvDTree.ComputeValue(testPoint); } if (prunedSequence.size() > 2) - cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / - (double) dataset.n_cols; + cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols; #pragma omp critical regularizationConstants += cvRegularizationConstants; @@ -285,7 +283,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, Log::Info << "Optimal alpha: " << optimalAlpha << "." << std::endl; // Initialize the tree. - DTree* dtreeOpt = new DTree(dataset); + DTree* dtreeOpt = new DTree(dataset); // Getting ready to grow the tree... for (size_t i = 0; i < oldFromNew.n_elem; i++) @@ -296,8 +294,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, // Grow the tree. oldAlpha = -DBL_MAX; - alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, - minLeafSize); + alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, minLeafSize); // Prune with optimal alpha. while ((oldAlpha < optimalAlpha) && (dtreeOpt->SubtreeLeaves() > 1)) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index f39b5bf36fa..f34750fd35a 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -73,7 +73,7 @@ class DTree * @param data Dataset to build tree on. */ DTree(MatType& data); - + /** * Create a child node of a density estimation tree given the bounding box * specified by maxVals and minVals, using the size given in start and end and @@ -154,7 +154,7 @@ class DTree * * @param tag Tag for the next leaf; leave at 0 for the initial call. */ - TagType TagTree(const TagType tag = 0); + TagType TagTree(const TagType& tag = 0); /** * Return the tag of the leaf containing the query. This is useful for @@ -271,13 +271,9 @@ class DTree //! Return the maximum values. const VecType& MaxVals() const { return maxVals; } - //! Modify the maximum values. - VecType& MaxVals() { return maxVals; } //! Return the minimum values. const VecType& MinVals() const { return minVals; } - //! Modify the minimum values. - VecType& MinVals() { return minVals; } /** * Serialize the density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 5ef37587e95..58131fb42d3 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -1,4 +1,4 @@ - /** +/** * @file dtree.cpp * @author Parikshit Ram (pram@cc.gatech.edu) * @@ -8,16 +8,91 @@ */ #include "dtree.hpp" #include +#include using namespace mlpack; using namespace det; +namespace detail +{ + template + class DTreeSplit + { + public: + typedef DTreeSplit SplitInfo; + + template + static bool AssignToLeftNode(const VecType& point, + const SplitInfo& splitInfo) + { + return point[splitInfo.splitDimension] < splitInfo.splitVal; + } + + private: + ElemType splitVal; + size_t splitDimension; + }; + + /** + * We need that function, to be able to specialize it for sparse matrices + * in a way which is much faster then usual iteration. + */ + template + void ExtractMinMax(const MatType& data, + VecType& minVals, + VecType& maxVals) + { + // Initialize to first column; values will be overwritten if necessary. + maxVals = data.col(0); + minVals = data.col(0); + + // Loop over data to extract maximum and minimum values in each dimension. + for (size_t i = 1; i < data.n_cols; ++i) + { + for (size_t j = 0; j < data.n_rows; ++j) + { + if (data(j, i) > maxVals[j]) + maxVals[j] = data(j, i); + if (data(j, i) < minVals[j]) + minVals[j] = data(j, i); + } + } + } + + /** + * Here is the optimized specialization + */ + template + void ExtractMinMax(const arma::SpMat& data, + arma::SpCol& minVals, + arma::SpCol& maxVals) + { + // Initialize to first column; values will be overwritten if necessary. + maxVals = data.col(0); + minVals = data.col(0); + + typename arma::sp_mat::iterator dataEnd = data.end(); + + // Loop over data to extract maximum and minimum values in each dimension. + for (typename arma::sp_mat::iterator i = data.begin(); i != dataEnd; ++i) + { + size_t j = i.row(); + if (i.col() == 0) + continue; // we've already taken these values. + else if (*i > maxVals[j]) + maxVals[j] = *i; + else if (*i < minVals[j]) + minVals[j] = *i; + } + } +}; + template DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(-DBL_MAX), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -42,7 +117,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(LogNegativeError(totalPoints)), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -60,7 +135,7 @@ DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), root(true), @@ -71,26 +146,10 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - // Loop over data to extract maximum and minimum values in each dimension. - for (size_t i = 1; i < data.n_cols; ++i) - { - for (size_t j = 0; j < data.n_rows; ++j) - { - if (data(j, i) > maxVals[j]) - maxVals[j] = data(j, i); - if (data(j, i) < minVals[j]) - minVals[j] = data(j, i); - } - } - + detail::ExtractMinMax(data, minVals, maxVals); logNegError = LogNegativeError(data.n_cols); } - // Non-root node initializers template DTree::DTree(const VecType& maxVals, @@ -103,7 +162,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(logNegError), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -127,7 +186,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(LogNegativeError(totalPoints)), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -156,12 +215,13 @@ double DTree::LogNegativeError(const size_t totalPoin double err = 2 * std::log((double) (end - start)) - 2 * std::log((double) totalPoints); - arma::vec valDiffs = maxVals - minVals; - for (size_t i = 0; i < maxVals.n_elem; ++i) + VecType valDiffs = maxVals - minVals; + typename VecType::iterator valEnd = valDiffs.end(); + for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i) { // Ignore very small dimensions to prevent overflow. - if (valDiffs[i] > 1e-50) - err -= std::log(valDiffs[i]); + if (*i > 1e-50) + err -= std::log(*i); } return err; @@ -191,11 +251,11 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) - for (intmax_t dim = 0; fold < (intmax_t) maxVals.n_elem; ++dim) + shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { @@ -220,7 +280,7 @@ bool DTree::FindSplit(const MatType& data, double volumeWithoutDim = logVolume - std::log(max - min); // Get the values for the dimension. - arma::rowvec dimVec = data.row(dim).subvec(start, end - 1); + VecType dimVec = data.row(dim).subvec(start, end - 1); // Sort the values in ascending order. dimVec = arma::sort(dimVec); @@ -265,9 +325,9 @@ bool DTree::FindSplit(const MatType& data, } } - double actualMinDimError = std::log(minDimError) - - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; +#pragma omp critical if ((actualMinDimError > minError) && dimSplitFound) { // Calculate actual error (in logspace) by adding terms back to our @@ -275,10 +335,8 @@ bool DTree::FindSplit(const MatType& data, minError = actualMinDimError; splitDim = dim; splitValue = dimSplitValue; - leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - - volumeWithoutDim; - rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - - volumeWithoutDim; + leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; splitFound = true; } // end if better split found in this dimension. } @@ -289,7 +347,7 @@ bool DTree::FindSplit(const MatType& data, template size_t DTree::SplitData(MatType& data, const size_t splitDim, - const double splitValue, + const ElemType splitValue, arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim @@ -310,7 +368,7 @@ size_t DTree::SplitData(MatType& data, data.swap_cols(left, right); - // Store the mapping from old to new. + // Store the mapping from old to new. Do not put std::swap here... const size_t tmp = oldFromNew[left]; oldFromNew[left] = oldFromNew[right]; oldFromNew[right] = tmp; @@ -353,6 +411,7 @@ double DTree::Grow(MatType& data, { // Move the data around for the children to have points in a node lie // contiguously (to increase efficiency during the training). +// const size_t splitIndex = splt::PerformSplit(data, start, end - start, ) const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew); // Make max and min vals for the children. @@ -372,10 +431,8 @@ double DTree::Grow(MatType& data, left = new DTree(maxValsL, minValsL, start, splitIndex, leftError); right = new DTree(maxValsR, minValsR, splitIndex, end, rightError); - leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, - minLeafSize); - rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, - minLeafSize); + leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); + rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); // Store values of R(T~) and |T~|. subtreeLeaves = left->SubtreeLeaves() + right->SubtreeLeaves(); @@ -440,14 +497,12 @@ double DTree::Grow(MatType& data, if (right->SubtreeLeaves() > 1) { - const double exponent = 2 * std::log((double) data.n_cols) + logVolume + - right->AlphaUpper(); + const double exponent = 2 * std::log((double) data.n_cols) + logVolume + right->AlphaUpper(); tmpAlphaSum += std::exp(exponent); } - alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - - logVolume; + alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - logVolume; double gT; if (useVolReg) @@ -613,15 +668,8 @@ double DTree::ComputeValue(const VecType& query) cons } else { - if (query[splitDim] <= splitValue) - { - // If left subtree, go to left child. - return left->ComputeValue(query); - } - else // If right subtree, go to right child - { - return right->ComputeValue(query); - } + // Return either of the two children - left or right, depending on the splitValue + return (query[splitDim] <= splitValue) ? left->ComputeValue(query) : right->ComputeValue(query); } return 0.0; @@ -630,7 +678,7 @@ double DTree::ComputeValue(const VecType& query) cons // Index the buckets for possible usage later. template -TagType DTree::TagTree(const TagType tag) +TagType DTree::TagTree(const TagType& tag) { if (subtreeLeaves == 1) { @@ -654,15 +702,10 @@ TagType DTree::FindBucket(const VecType& query) const { return bucketTag; } - else if (query[splitDim] <= splitValue) - { - // If left subtree, go to left child. - return left->FindBucket(query); - } else { - // If right subtree, go to right child. - return right->FindBucket(query); + // Return the tag from either of the two children - left or right. + return (query[splitDim] <= splitValue) ? left->FindBucket(query) : right->FindBucket(query); } } From 0b58fd9a4542b00e12c4525b86d390154631a9ad Mon Sep 17 00:00:00 2001 From: theJonan Date: Fri, 14 Oct 2016 17:42:25 +0300 Subject: [PATCH 04/25] - DET changes propagated to tests. --- src/mlpack/methods/det/dtree_impl.hpp | 29 ++++++++++++++----------- src/mlpack/tests/det_test.cpp | 27 ++++++++++------------- src/mlpack/tests/serialization_test.cpp | 15 +++++++------ 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 58131fb42d3..2261bbfb436 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -250,12 +250,12 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 - #pragma omp parallel for default(none) \ - shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + #pragma omp parallel for default(shared) \ + shared(splitValue, splitDim, data) for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else - #pragma omp parallel for default(none) \ - shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + #pragma omp parallel for default(shared) \ + shared(splitValue, splitDim, data) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { @@ -327,17 +327,20 @@ bool DTree::FindSplit(const MatType& data, double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; -#pragma omp critical +#pragma omp atomic if ((actualMinDimError > minError) && dimSplitFound) { - // Calculate actual error (in logspace) by adding terms back to our - // estimate. - minError = actualMinDimError; - splitDim = dim; - splitValue = dimSplitValue; - leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - splitFound = true; +#pragma omp critical DTreeFindUpdate + { + // Calculate actual error (in logspace) by adding terms back to our + // estimate. + minError = actualMinDimError; + splitDim = dim; + splitValue = dimSplitValue; + leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + splitFound = true; + } } // end if better split found in this dimension. } diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 7e32defa4d5..2b0ef3788a0 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE(TestGetMaxMinVals) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree tree(testData); + DTree tree(testData); BOOST_REQUIRE_EQUAL(tree.maxVals[0], 7); BOOST_REQUIRE_EQUAL(tree.minVals[0], 3); @@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(TestComputeNodeError) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); double trueNodeError = -log(4.0) - log(7.0) - log(7.0); BOOST_REQUIRE_CLOSE((double) testDTree.logNegError, trueNodeError, 1e-10); @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(TestWithinRange) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); arma::vec testQuery(3); testQuery << 4.5 << 2.5 << 2; @@ -95,11 +95,10 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); size_t obDim, trueDim; - double trueLeftError, obLeftError, trueRightError, obRightError, - obSplit, trueSplit; + double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; trueDim = 2; trueSplit = 5.5; @@ -107,8 +106,7 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) trueRightError = 2 * log(3.0 / 5.0) - (log(7.0) + log(4.0) + log(2.5)); testDTree.logVolume = log(7.0) + log(4.0) + log(7.0); - BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, - obRightError, 1)); + BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1)); BOOST_REQUIRE(trueDim == obDim); BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10); @@ -125,7 +123,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); arma::Col oTest(5); oTest << 1 << 2 << 3 << 4 << 5; @@ -133,8 +131,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) size_t splitDim = 2; double trueSplitVal = 5.5; - size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, - oTest); + size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest); BOOST_REQUIRE_EQUAL(splitInd, 2); // 2 points on left side. @@ -169,7 +166,7 @@ BOOST_AUTO_TEST_CASE(TestGrow) rlError = 2 * log(1.0 / 5.0) - (log(0.5) + log(4.0) + log(2.5)); rrError = 2 * log(2.0 / 5.0) - (log(6.5) + log(4.0) + log(2.5)); - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); BOOST_REQUIRE_EQUAL(oTest[0], 0); @@ -222,7 +219,7 @@ BOOST_AUTO_TEST_CASE(TestPruneAndUpdate) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); @@ -255,7 +252,7 @@ BOOST_AUTO_TEST_CASE(TestComputeValue) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); @@ -298,7 +295,7 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); testDTree.Grow(testData, oTest, false, 2, 1); arma::vec imps; diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp index e6aecc78b43..d7ea76e74fb 100644 --- a/src/mlpack/tests/serialization_test.cpp +++ b/src/mlpack/tests/serialization_test.cpp @@ -853,18 +853,19 @@ BOOST_AUTO_TEST_CASE(SoftmaxRegressionTest) BOOST_AUTO_TEST_CASE(DETTest) { using det::DTree; + typedef DTree DTreeX; // Create a density estimation tree on a random dataset. arma::mat dataset = arma::randu(25, 5000); - DTree tree(dataset); + DTreeX tree(dataset); arma::mat otherDataset = arma::randu(5, 100); - DTree xmlTree, binaryTree, textTree(otherDataset); + DTreeX xmlTree, binaryTree, textTree(otherDataset); SerializeObjectAll(tree, xmlTree, binaryTree, textTree); - std::stack stack, xmlStack, binaryStack, textStack; + std::stack stack, xmlStack, binaryStack, textStack; stack.push(&tree); xmlStack.push(&xmlTree); binaryStack.push(&binaryTree); @@ -873,10 +874,10 @@ BOOST_AUTO_TEST_CASE(DETTest) while (!stack.empty()) { // Get the top node from the stack. - DTree* node = stack.top(); - DTree* xmlNode = xmlStack.top(); - DTree* binaryNode = binaryStack.top(); - DTree* textNode = textStack.top(); + DTreeX* node = stack.top(); + DTreeX* xmlNode = xmlStack.top(); + DTreeX* binaryNode = binaryStack.top(); + DTreeX* textNode = textStack.top(); stack.pop(); xmlStack.pop(); From 47bf92f33a5d71a3d8e54ed8552cb0e7951dc72e Mon Sep 17 00:00:00 2001 From: theJonan Date: Fri, 14 Oct 2016 23:06:19 +0300 Subject: [PATCH 05/25] - DET templating ready and tests passing. --- .../core/arma_extend/Mat_extra_bones.hpp | 9 + .../core/arma_extend/SpMat_extra_bones.hpp | 8 + src/mlpack/methods/det/det_main.cpp | 5 +- src/mlpack/methods/det/dt_utils.hpp | 22 +-- src/mlpack/methods/det/dt_utils_impl.hpp | 22 +-- src/mlpack/methods/det/dtree.hpp | 2 +- src/mlpack/methods/det/dtree_impl.hpp | 187 +++++++----------- src/mlpack/tests/det_test.cpp | 18 +- src/mlpack/tests/serialization_test.cpp | 2 +- 9 files changed, 126 insertions(+), 149 deletions(-) diff --git a/src/mlpack/core/arma_extend/Mat_extra_bones.hpp b/src/mlpack/core/arma_extend/Mat_extra_bones.hpp index e09f5f55b0b..f4f25d610a4 100644 --- a/src/mlpack/core/arma_extend/Mat_extra_bones.hpp +++ b/src/mlpack/core/arma_extend/Mat_extra_bones.hpp @@ -12,6 +12,15 @@ template void serialize(Archive& ar, const unsigned int version); +/** + * These will help us refer the proper vector / column types, only with + * specifying the matrix type we want to use. + */ + +typedef Col vec_type; +typedef Col col_type; +typedef Row row_type; + /* * Add row_col_iterator and row_col_const_iterator to arma::Mat. */ diff --git a/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp b/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp index d3c18dec1e5..a5d274c4b96 100644 --- a/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp +++ b/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp @@ -16,6 +16,14 @@ template void serialize(Archive& ar, const unsigned int version); +/** + * These will help us refer the proper vector / column types, only with + * specifying the matrix type we want to use. + */ +typedef SpCol vec_type; +typedef SpCol col_type; +typedef SpRow row_type; + /* * Extra functions for SpMat * Adding definition of row_col_iterator to generalize with Mat::row_col_iterator diff --git a/src/mlpack/methods/det/det_main.cpp b/src/mlpack/methods/det/det_main.cpp index 26b394d7802..16ffa25b774 100644 --- a/src/mlpack/methods/det/det_main.cpp +++ b/src/mlpack/methods/det/det_main.cpp @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) << "(-T) is not specified." << endl; // Are we training a DET or loading from file? - DTree* tree; + DTree* tree; if (CLI::HasParam("training_file")) { const string trainSetFile = CLI::GetParam("training_file"); @@ -127,8 +127,7 @@ int main(int argc, char *argv[]) // Obtain the optimal tree. Timer::Start("det_training"); - tree = Trainer(trainingData, folds, regularization, maxLeafSize, - minLeafSize, ""); + tree = Trainer(trainingData, folds, regularization, maxLeafSize, minLeafSize, ""); Timer::Stop("det_training"); // Compute training set estimates, if desired. diff --git a/src/mlpack/methods/det/dt_utils.hpp b/src/mlpack/methods/det/dt_utils.hpp index 067e6fe31d7..3535eaeadd7 100644 --- a/src/mlpack/methods/det/dt_utils.hpp +++ b/src/mlpack/methods/det/dt_utils.hpp @@ -25,8 +25,8 @@ namespace det { * @param numClasses Number of classes in dataset. * @param leafClassMembershipFile Name of file to print to (optional). */ -template -void PrintLeafMembership(DTree* dtree, +template +void PrintLeafMembership(DTree* dtree, const MatType& data, const arma::Mat& labels, const size_t numClasses, @@ -40,8 +40,8 @@ void PrintLeafMembership(DTree* dtree, * @param dtree Density tree to use. * @param viFile Name of file to print to (optional). */ -template -void PrintVariableImportance(const DTree* dtree, +template +void PrintVariableImportance(const DTree* dtree, const std::string viFile = ""); /** @@ -56,13 +56,13 @@ void PrintVariableImportance(const DTree* dtree, * @param minLeafSize Minimum number of points allowed in a leaf. * @param unprunedTreeOutput Filename to print unpruned tree to (optional). */ -template -DTree* Trainer(MatType& dataset, - const size_t folds, - const bool useVolumeReg = false, - const size_t maxLeafSize = 10, - const size_t minLeafSize = 5, - const std::string unprunedTreeOutput = ""); +template +DTree* Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg = false, + const size_t maxLeafSize = 10, + const size_t minLeafSize = 5, + const std::string unprunedTreeOutput = ""); } // namespace det } // namespace mlpack diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index 4c057f76dc9..cad52891554 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -10,8 +10,8 @@ using namespace mlpack; using namespace det; -template -void mlpack::det::PrintLeafMembership(DTree* dtree, +template +void mlpack::det::PrintLeafMembership(DTree* dtree, const MatType& data, const arma::Mat& labels, const size_t numClasses, @@ -25,7 +25,7 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, for (size_t i = 0; i < data.n_cols; i++) { - const VecType testPoint = data.unsafe_col(i); + const typename MatType::vec_type testPoint = data.unsafe_col(i); const TagType leafTag = dtree->FindBucket(testPoint); const size_t label = labels[i]; table(leafTag, label) += 1; @@ -58,8 +58,8 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, return; } -template -void mlpack::det::PrintVariableImportance(const DTree* dtree, +template +void mlpack::det::PrintVariableImportance(const DTree* dtree, const std::string viFile) { arma::vec imps; @@ -97,8 +97,8 @@ void mlpack::det::PrintVariableImportance(const DTree // This function trains the optimal decision tree using the given number of // folds. -template -DTree* mlpack::det::Trainer(MatType& dataset, +template +DTree* mlpack::det::Trainer(MatType& dataset, const size_t folds, const bool useVolumeReg, const size_t maxLeafSize, @@ -106,7 +106,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, const std::string unprunedTreeOutput) { // Initialize the tree. - DTree dtree(dataset); + DTree dtree(dataset); // Prepare to grow the tree... arma::Col oldFromNew(dataset.n_cols); @@ -211,7 +211,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, } // Initialize the tree. - DTree cvDTree(train); + DTree cvDTree(train); // Getting ready to grow the tree... arma::Col cvOldFromNew(train.n_cols); @@ -251,7 +251,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, double cvVal = 0.0; for (size_t i = 0; i < test.n_cols; ++i) { - VecType testPoint = test.unsafe_col(i); + typename MatType::vec_type testPoint = test.unsafe_col(i); cvVal += cvDTree.ComputeValue(testPoint); } @@ -283,7 +283,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, Log::Info << "Optimal alpha: " << optimalAlpha << "." << std::endl; // Initialize the tree. - DTree* dtreeOpt = new DTree(dataset); + DTree* dtreeOpt = new DTree(dataset); // Getting ready to grow the tree... for (size_t i = 0; i < oldFromNew.n_elem; i++) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index f34750fd35a..62342879b5a 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -37,7 +37,6 @@ namespace det /** Density Estimation Trees */ { * @endcode */ template class DTree { @@ -46,6 +45,7 @@ class DTree * The actual, underlying type we're working with */ typedef typename MatType::elem_type ElemType; + typedef typename MatType::vec_type VecType; /** * Create an empty density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 2261bbfb436..b4560242d17 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -32,63 +32,10 @@ namespace detail ElemType splitVal; size_t splitDimension; }; - - /** - * We need that function, to be able to specialize it for sparse matrices - * in a way which is much faster then usual iteration. - */ - template - void ExtractMinMax(const MatType& data, - VecType& minVals, - VecType& maxVals) - { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - // Loop over data to extract maximum and minimum values in each dimension. - for (size_t i = 1; i < data.n_cols; ++i) - { - for (size_t j = 0; j < data.n_rows; ++j) - { - if (data(j, i) > maxVals[j]) - maxVals[j] = data(j, i); - if (data(j, i) < minVals[j]) - minVals[j] = data(j, i); - } - } - } - - /** - * Here is the optimized specialization - */ - template - void ExtractMinMax(const arma::SpMat& data, - arma::SpCol& minVals, - arma::SpCol& maxVals) - { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - typename arma::sp_mat::iterator dataEnd = data.end(); - - // Loop over data to extract maximum and minimum values in each dimension. - for (typename arma::sp_mat::iterator i = data.begin(); i != dataEnd; ++i) - { - size_t j = i.row(); - if (i.col() == 0) - continue; // we've already taken these values. - else if (*i > maxVals[j]) - maxVals[j] = *i; - else if (*i < minVals[j]) - minVals[j] = *i; - } - } }; -template -DTree::DTree() : +template +DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), @@ -108,10 +55,10 @@ DTree::DTree() : // Root node initializers -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t totalPoints) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints) : start(0), end(totalPoints), maxVals(maxVals), @@ -130,8 +77,8 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::DTree(MatType & data) : +template +DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), @@ -146,17 +93,31 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - detail::ExtractMinMax(data, minVals, maxVals); + maxVals = data.col(0); + minVals = data.col(0); + + typename MatType::row_col_iterator dataEnd = data.end_row_col(); + + // Loop over data to extract maximum and minimum values in each dimension. + for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i) + { + size_t j = i.row(); + if (*i > maxVals[j]) + maxVals[j] = *i; + else if (*i < minVals[j]) + minVals[j] = *i; + } + logNegError = LogNegativeError(data.n_cols); } // Non-root node initializers -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t start, - const size_t end, - const double logNegError) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t start, + const size_t end, + const double logNegError) : start(start), end(end), maxVals(maxVals), @@ -175,12 +136,12 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t totalPoints, - const size_t start, - const size_t end) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints, + const size_t start, + const size_t end) : start(start), end(end), maxVals(maxVals), @@ -199,8 +160,8 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::~DTree() +template +DTree::~DTree() { delete left; delete right; @@ -208,8 +169,8 @@ DTree::~DTree() // This function computes the log-l2-negative-error of a given node from the // formula R(t) = log(|t|^2 / (N^2 V_t)). -template -double DTree::LogNegativeError(const size_t totalPoints) const +template +double DTree::LogNegativeError(const size_t totalPoints) const { // log(-|t|^2 / (N^2 V_t)) = log(-1) + 2 log(|t|) - 2 log(N) - log(V_t). double err = 2 * std::log((double) (end - start)) - @@ -230,13 +191,13 @@ double DTree::LogNegativeError(const size_t totalPoin // This function finds the best split with respect to the L2-error, by trying // all possible splits. The dataset is the full data set but the start and // end are used to obtain the point in this node. -template -bool DTree::FindSplit(const MatType& data, - size_t& splitDim, - ElemType& splitValue, - double& leftError, - double& rightError, - const size_t minLeafSize) const +template +bool DTree::FindSplit(const MatType& data, + size_t& splitDim, + ElemType& splitValue, + double& leftError, + double& rightError, + const size_t minLeafSize) const { // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. @@ -280,7 +241,7 @@ bool DTree::FindSplit(const MatType& data, double volumeWithoutDim = logVolume - std::log(max - min); // Get the values for the dimension. - VecType dimVec = data.row(dim).subvec(start, end - 1); + typename MatType::row_type dimVec = data.row(dim).subvec(start, end - 1); // Sort the values in ascending order. dimVec = arma::sort(dimVec); @@ -347,11 +308,11 @@ bool DTree::FindSplit(const MatType& data, return splitFound; } -template -size_t DTree::SplitData(MatType& data, - const size_t splitDim, - const ElemType splitValue, - arma::Col& oldFromNew) const +template +size_t DTree::SplitData(MatType& data, + const size_t splitDim, + const ElemType splitValue, + arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim // less than or equal to splitValue are on the left side, and all others are @@ -382,12 +343,12 @@ size_t DTree::SplitData(MatType& data, } // Greedily expand the tree -template -double DTree::Grow(MatType& data, - arma::Col& oldFromNew, - const bool useVolReg, - const size_t maxLeafSize, - const size_t minLeafSize) +template +double DTree::Grow(MatType& data, + arma::Col& oldFromNew, + const bool useVolReg, + const size_t maxLeafSize, + const size_t minLeafSize) { Log::Assert(data.n_rows == maxVals.n_elem); Log::Assert(data.n_rows == minVals.n_elem); @@ -527,10 +488,10 @@ double DTree::Grow(MatType& data, } -template -double DTree::PruneAndUpdate(const double oldAlpha, - const size_t points, - const bool useVolReg) +template +double DTree::PruneAndUpdate(const double oldAlpha, + const size_t points, + const bool useVolReg) { // Compute gT. if (subtreeLeaves == 1) // If we are a leaf... @@ -642,8 +603,8 @@ double DTree::PruneAndUpdate(const double oldAlpha, // // Future improvement: Open up the range with epsilons on both sides where // epsilon depends on the density near the boundary. -template -bool DTree::WithinRange(const VecType& query) const +template +bool DTree::WithinRange(const VecType& query) const { for (size_t i = 0; i < query.n_elem; ++i) if ((query[i] < minVals[i]) || (query[i] > maxVals[i])) @@ -653,8 +614,8 @@ bool DTree::WithinRange(const VecType& query) const } -template -double DTree::ComputeValue(const VecType& query) const +template +double DTree::ComputeValue(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -680,8 +641,8 @@ double DTree::ComputeValue(const VecType& query) cons // Index the buckets for possible usage later. -template -TagType DTree::TagTree(const TagType& tag) +template +TagType DTree::TagTree(const TagType& tag) { if (subtreeLeaves == 1) { @@ -696,8 +657,8 @@ TagType DTree::TagTree(const TagType& tag) } -template -TagType DTree::FindBucket(const VecType& query) const +template +TagType DTree::FindBucket(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -712,8 +673,8 @@ TagType DTree::FindBucket(const VecType& query) const } } -template -void DTree::ComputeVariableImportance(arma::vec& importances) const +template +void DTree::ComputeVariableImportance(arma::vec& importances) const { // Clear and set to right size. importances.zeros(maxVals.n_elem); @@ -740,9 +701,9 @@ void DTree::ComputeVariableImportance(arma::vec& impo } } -template +template template -void DTree::Serialize(Archive& ar, const unsigned int /* version */) +void DTree::Serialize(Archive& ar, const unsigned int /* version */) { using data::CreateNVP; diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 2b0ef3788a0..33659842f19 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE(TestGetMaxMinVals) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree tree(testData); + DTree tree(testData); BOOST_REQUIRE_EQUAL(tree.maxVals[0], 7); BOOST_REQUIRE_EQUAL(tree.minVals[0], 3); @@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(TestComputeNodeError) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); double trueNodeError = -log(4.0) - log(7.0) - log(7.0); BOOST_REQUIRE_CLOSE((double) testDTree.logNegError, trueNodeError, 1e-10); @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(TestWithinRange) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); arma::vec testQuery(3); testQuery << 4.5 << 2.5 << 2; @@ -95,7 +95,7 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); size_t obDim, trueDim; double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; @@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); arma::Col oTest(5); oTest << 1 << 2 << 3 << 4 << 5; @@ -166,7 +166,7 @@ BOOST_AUTO_TEST_CASE(TestGrow) rlError = 2 * log(1.0 / 5.0) - (log(0.5) + log(4.0) + log(2.5)); rrError = 2 * log(2.0 / 5.0) - (log(6.5) + log(4.0) + log(2.5)); - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); BOOST_REQUIRE_EQUAL(oTest[0], 0); @@ -219,7 +219,7 @@ BOOST_AUTO_TEST_CASE(TestPruneAndUpdate) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); @@ -252,7 +252,7 @@ BOOST_AUTO_TEST_CASE(TestComputeValue) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); @@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); testDTree.Grow(testData, oTest, false, 2, 1); arma::vec imps; diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp index d7ea76e74fb..64769df90b0 100644 --- a/src/mlpack/tests/serialization_test.cpp +++ b/src/mlpack/tests/serialization_test.cpp @@ -853,7 +853,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxRegressionTest) BOOST_AUTO_TEST_CASE(DETTest) { using det::DTree; - typedef DTree DTreeX; + typedef DTree DTreeX; // Create a density estimation tree on a random dataset. arma::mat dataset = arma::randu(25, 5000); From e4a9be06c6733590d896bdbdb1b63d30b67ef4f7 Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 14:30:48 +0300 Subject: [PATCH 06/25] - More sparse-matrix migration steps. --- src/mlpack/methods/det/dtree.hpp | 4 +- src/mlpack/methods/det/dtree_impl.hpp | 106 ++++++++++++++++++++++---- 2 files changed, 95 insertions(+), 15 deletions(-) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 62342879b5a..9a2b1751f87 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -199,9 +199,9 @@ class DTree size_t end; //! Upper half of bounding box for this node. - arma::vec maxVals; + VecType maxVals; //! Lower half of bounding box for this node. - arma::vec minVals; + VecType minVals; //! The splitting dimension for this node. size_t splitDim; diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index b4560242d17..b683ffddf7a 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -8,6 +8,8 @@ */ #include "dtree.hpp" #include +#include +#include #include using namespace mlpack; @@ -32,6 +34,68 @@ namespace detail ElemType splitVal; size_t splitDimension; }; + + /** + * Get the values for the dimension and sort them. The old implementation: + * dimVec = data.row(dim).subvec(start, end - 1); + * dimVec = arma::sort(dimVec); + * was quite inefficient, due to many (3) vector copy operations. This could be a + * problem especially for sparse matrices. That's why they have custom implementation. + */ + template + typename MatType::row_type ExtractSortedRow(const MatType& data, + size_t dim, + size_t start, + size_t end) + { + typedef typename MatType::elem_type ElemType; + + assert(start < end); + + typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + std::sort(dimVec.begin(), dimVec.end()); + return dimVec; + } + +// template +// std::vector > ExtractSortedRow(const arma::SpMat& data, +// size_t dim, +// size_t start, +// size_t end, +// size_t padding) +// { +// typedef SortedItem SortedType; +// +// assert(padding > 0); +// assert(start < end); +// +// arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); +// typename arma::SpRow::iterator dimVecEnd = dimVec.end(); +// +// // Build the vector to be sorted with values in ascending order. +// std::vector sortedDim = std::vector(); +// sortedDim.reserve(dimVec.n_elem / 2); +// +// // Prepare these for the iteration. +// ElemType lastVal = 0; +// --padding; +// +// // Iterate over the row and put only different values, also skipping +// // `padding` number of elements from both sides of the row. +// for (typename MatType::row_col_iterator di = dimVec.begin_row_col(); di != dimVecEnd; ++di) +// { +// if (di.col() < padding || di.col() >= dimVec.n_elem - padding) +// continue; +// // if (*di == lastVal && sortedDim.size() > 0) +// // continue; +// +// sortedDim.push_back(lastVal = *di); +// } +// +// std::sort(sortedDim.begin(), sortedDim.end()); +// return sortedDim; +// } + }; template @@ -239,22 +303,38 @@ bool DTree::FindSplit(const MatType& data, // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); - - // Get the values for the dimension. - typename MatType::row_type dimVec = data.row(dim).subvec(start, end - 1); - - // Sort the values in ascending order. - dimVec = arma::sort(dimVec); + + // Get a sorted version of the dimension in interest, + // from the given samples range. This is the most expensive step. + typename MatType::row_type dimVec = detail::ExtractSortedRow(data, + dim, + start, + end); + + typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); + typename MatType::row_col_iterator dI = dimVec.begin_row_col(); // Find the best split for this dimension. We need to figure out why // there are spikes if this minLeafSize is enforced here... - for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) + for (;;) { + const size_t position = dI.col(); + + if (position >= dimVec.n_cols - minLeafSize) + break; + if (position < minLeafSize - 1) + continue; + + ElemType split = *dI; + if (++dI == dimVecEnd) + break; // This means we have same values till the end => No split. + // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. - const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; - - if (split == dimVec[i]) + split += *dI; + split /= 2.0; + + if (split == *dI) continue; // We can't split here (two points are the same). // Another way of picking split is using this: @@ -263,7 +343,7 @@ bool DTree::FindSplit(const MatType& data, { // Ensure that the right node will have at least the minimum number of // points. - Log::Assert((points - i - 1) >= minLeafSize); + Log::Assert((points - position - 1) >= minLeafSize); // Now we have to see if the error will be reduced. Simple manipulation // of the error function gives us the condition we must satisfy: @@ -271,8 +351,8 @@ bool DTree::FindSplit(const MatType& data, // and because the volume is only dependent on the dimension we are // splitting, we can assume V_l is just the range of the left and V_r is // just the range of the right. - double negLeftError = std::pow(i + 1, 2.0) / (split - min); - double negRightError = std::pow(points - i - 1, 2.0) / (max - split); + double negLeftError = std::pow(position + 1, 2.0) / (split - min); + double negRightError = std::pow(points - position - 1, 2.0) / (max - split); // If this is better, take it. if ((negLeftError + negRightError) >= minDimError) From 3547fffc27f1f0a38b7813e902c4143de6b8a5f8 Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 17:03:18 +0300 Subject: [PATCH 07/25] - First successfull SpMat build. --- src/mlpack/methods/det/dtree_impl.hpp | 104 +++++++++++++++----------- 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index b683ffddf7a..16092b8034e 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -8,13 +8,52 @@ */ #include "dtree.hpp" #include -#include #include +#include #include using namespace mlpack; using namespace det; +namespace arma +{ + template + SpMat sort(const SpMat& data) + { + typedef std::vector ElemVectorType; + + // Construct the vector of values / locations... + ElemVectorType indexVec(data.begin(), data.end()); + + // ... and sort it! + std::sort(indexVec.begin(), indexVec.end()); + + // Now prepare the structures for the batch construction of the + // sorted sparse row. + arma::umat locations(2, data.n_nonzero); + arma::Col vals(data.n_nonzero); + ElemType lastVal = -std::numeric_limits::max(); + size_t padding = 0; + + for (size_t ii = 0; ii < indexVec.size(); ++ii) + { + const ElemType newVal = indexVec[ii]; + if (lastVal < ElemType(0) && newVal > ElemType(0)) + { + assert(padding == 0); // we should arrive here once! + padding = data.n_elem - data.n_nonzero; + } + + locations.at(0, ii) = (ii + padding) % data.n_rows; + locations.at(1, ii) = (ii + padding) / data.n_rows; + vals.at(ii) = lastVal = newVal; + } + + return SpMat(locations, vals, data.n_rows, data.n_cols, false, false); + }; + +}; + namespace detail { template @@ -57,44 +96,21 @@ namespace detail return dimVec; } -// template -// std::vector > ExtractSortedRow(const arma::SpMat& data, -// size_t dim, -// size_t start, -// size_t end, -// size_t padding) -// { -// typedef SortedItem SortedType; -// -// assert(padding > 0); -// assert(start < end); -// -// arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); -// typename arma::SpRow::iterator dimVecEnd = dimVec.end(); -// -// // Build the vector to be sorted with values in ascending order. -// std::vector sortedDim = std::vector(); -// sortedDim.reserve(dimVec.n_elem / 2); -// -// // Prepare these for the iteration. -// ElemType lastVal = 0; -// --padding; -// -// // Iterate over the row and put only different values, also skipping -// // `padding` number of elements from both sides of the row. -// for (typename MatType::row_col_iterator di = dimVec.begin_row_col(); di != dimVecEnd; ++di) -// { -// if (di.col() < padding || di.col() >= dimVec.n_elem - padding) -// continue; -// // if (*di == lastVal && sortedDim.size() > 0) -// // continue; -// -// sortedDim.push_back(lastVal = *di); -// } -// -// std::sort(sortedDim.begin(), sortedDim.end()); -// return sortedDim; -// } + /** + * We need a custom implementation for sparse matrix, in order to save sorting of + * all these zeroes. + */ + template + arma::SpRow ExtractSortedRow(const arma::SpMat& data, + size_t dim, + size_t start, + size_t end) + { + assert(start < end); + + arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); + return arma::sort(dimVec); + } }; @@ -314,20 +330,24 @@ bool DTree::FindSplit(const MatType& data, typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); typename MatType::row_col_iterator dI = dimVec.begin_row_col(); - // Find the best split for this dimension. We need to figure out why - // there are spikes if this minLeafSize is enforced here... + // Find the best split for this dimension. for (;;) { const size_t position = dI.col(); + // Ensure the minimum leaf size on both sides. We need to figure out why + // there are spikes if this minLeafSize is enforced here... if (position >= dimVec.n_cols - minLeafSize) break; if (position < minLeafSize - 1) continue; ElemType split = *dI; + + // In case of sparse matrices and all present values being negative + // this can happen - last many elements being 0, so we need to check. if (++dI == dimVecEnd) - break; // This means we have same values till the end => No split. + break; // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. From 11c4b0ae7c7f0bb037432c80244d9bed45e55ffd Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 22:55:41 +0300 Subject: [PATCH 08/25] - Sparse DTree finally working. --- src/mlpack/methods/det/dtree.hpp | 25 +-- src/mlpack/methods/det/dtree_impl.hpp | 256 +++++++++++--------------- 2 files changed, 117 insertions(+), 164 deletions(-) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 9a2b1751f87..70c8cc5215c 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -44,8 +44,9 @@ class DTree /** * The actual, underlying type we're working with */ - typedef typename MatType::elem_type ElemType; - typedef typename MatType::vec_type VecType; + typedef typename MatType::elem_type ElemType; + typedef typename MatType::vec_type VecType; + typedef typename arma::Col StatType; /** * Create an empty density estimation tree. @@ -60,8 +61,8 @@ class DTree * @param minVals Minimum values of the bounding box. * @param totalPoints Total number of points in the dataset. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints); /** @@ -86,8 +87,8 @@ class DTree * @param end End of points represented by this node in the data matrix. * @param error log-negative error of this node. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t start, const size_t end, const double logNegError); @@ -103,8 +104,8 @@ class DTree * @param start Start of points represented by this node in the data matrix. * @param end End of points represented by this node in the data matrix. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints, const size_t start, const size_t end); @@ -199,9 +200,9 @@ class DTree size_t end; //! Upper half of bounding box for this node. - VecType maxVals; + StatType maxVals; //! Lower half of bounding box for this node. - VecType minVals; + StatType minVals; //! The splitting dimension for this node. size_t splitDim; @@ -270,10 +271,10 @@ class DTree TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; } //! Return the maximum values. - const VecType& MaxVals() const { return maxVals; } + const StatType& MaxVals() const { return maxVals; } //! Return the minimum values. - const VecType& MinVals() const { return minVals; } + const StatType& MinVals() const { return minVals; } /** * Serialize the density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 16092b8034e..ad54039274a 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -5,113 +5,98 @@ * Implementations of some declared functions in * the Density Estimation Tree class. * + * Sparsification and optimizations. + * @author Ivan Georgiev (ivan@jonan.info) + * */ #include "dtree.hpp" #include -#include #include -#include using namespace mlpack; using namespace det; -namespace arma +namespace details { - template - SpMat sort(const SpMat& data) + /** + * This one sorts and scand the given per-dimension extract and puts all splits + * in a vector, that can easily be iterated afterwards. + */ + template + std::vector> ExtractSplits(RowType& row, size_t minLeafSize) { - typedef std::vector ElemVectorType; + typedef std::pair SplitItem; + std::vector splitVec; - // Construct the vector of values / locations... - ElemVectorType indexVec(data.begin(), data.end()); - - // ... and sort it! - std::sort(indexVec.begin(), indexVec.end()); - - // Now prepare the structures for the batch construction of the - // sorted sparse row. - arma::umat locations(2, data.n_nonzero); - arma::Col vals(data.n_nonzero); - ElemType lastVal = -std::numeric_limits::max(); - size_t padding = 0; + // We want to do that inplace for optimization purposes; + std::sort(row.begin(), row.end()); - for (size_t ii = 0; ii < indexVec.size(); ++ii) + // Ensure the minimum leaf size on both sides. We need to figure out why + // there are spikes if this minLeafSize is enforced here... + for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i) { - const ElemType newVal = indexVec[ii]; - if (lastVal < ElemType(0) && newVal > ElemType(0)) - { - assert(padding == 0); // we should arrive here once! - padding = data.n_elem - data.n_nonzero; - } + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. + const ElemType split = (row[i] + row[i + 1]) / 2.0; - locations.at(0, ii) = (ii + padding) % data.n_rows; - locations.at(1, ii) = (ii + padding) / data.n_rows; - vals.at(ii) = lastVal = newVal; + // Check if we can split here (two points are different) + if (split != row[i]) + splitVec.push_back(SplitItem(split, i)); } - return SpMat(locations, vals, data.n_rows, data.n_cols, false, false); - }; + return splitVec; + + } -}; - -namespace detail -{ + // This the custom, sparse optimized implementation of the same routine. template - class DTreeSplit + std::vector> ExtractSplits(arma::SpRow& row, size_t minLeafSize) { - public: - typedef DTreeSplit SplitInfo; - - template - static bool AssignToLeftNode(const VecType& point, - const SplitInfo& splitInfo) - { - return point[splitInfo.splitDimension] < splitInfo.splitVal; - } + typedef std::pair SplitItem; + std::vector splitVec; - private: - ElemType splitVal; - size_t splitDimension; - }; - - /** - * Get the values for the dimension and sort them. The old implementation: - * dimVec = data.row(dim).subvec(start, end - 1); - * dimVec = arma::sort(dimVec); - * was quite inefficient, due to many (3) vector copy operations. This could be a - * problem especially for sparse matrices. That's why they have custom implementation. - */ - template - typename MatType::row_type ExtractSortedRow(const MatType& data, - size_t dim, - size_t start, - size_t end) - { - typedef typename MatType::elem_type ElemType; + // Construct a vector of values. + std::vector valsVec(row.begin(), row.end()); - assert(start < end); + // ... and sort it! + std::sort(valsVec.begin(), valsVec.end()); - typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); - std::sort(dimVec.begin(), dimVec.end()); - return dimVec; - } - - /** - * We need a custom implementation for sparse matrix, in order to save sorting of - * all these zeroes. - */ - template - arma::SpRow ExtractSortedRow(const arma::SpMat& data, - size_t dim, - size_t start, - size_t end) - { - assert(start < end); + // Now iterate over the values, taking account for the over-the-zeroes + // jump and construct the splits vector. + ElemType lastVal = -std::numeric_limits::max(); + size_t padding = 0, zeroes = row.n_elem - row.n_nonzero; + + for (size_t i = 0; i < valsVec.size(); ++i) + { + const ElemType newVal = valsVec[i]; + if (lastVal < ElemType(0) && newVal > ElemType(0) && zeroes > 0) + { + Log::Assert(padding == 0); // we should arrive here once! + if (lastVal >= valsVec[0] && // i.e. we're not in the beginning + i >= minLeafSize && + i <= row.n_elem - minLeafSize) + splitVec.push_back(SplitItem(lastVal / 2.0, i - 1)); + + padding = zeroes; + lastVal = ElemType(0); + } + + if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case + { + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. + const ElemType split = (lastVal + newVal) / 2.0; + + // Check if we can split here (two points are different) + if (split != newVal) + splitVec.push_back(SplitItem(split, i + padding - 1)); + } + + lastVal = newVal; + } - arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); - return arma::sort(dimVec); + return splitVec; } - }; template @@ -136,8 +121,8 @@ DTree::DTree() : // Root node initializers template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints) : start(0), end(totalPoints), @@ -161,6 +146,8 @@ template DTree::DTree(MatType & data) : start(0), end(data.n_cols), + minVals(arma::min(data, 1)), + maxVals(arma::max(data, 1)), splitDim(size_t(-1)), splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), @@ -173,28 +160,13 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - maxVals = data.col(0); - minVals = data.col(0); - - typename MatType::row_col_iterator dataEnd = data.end_row_col(); - - // Loop over data to extract maximum and minimum values in each dimension. - for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i) - { - size_t j = i.row(); - if (*i > maxVals[j]) - maxVals[j] = *i; - else if (*i < minVals[j]) - minVals[j] = *i; - } - logNegError = LogNegativeError(data.n_cols); } // Non-root node initializers template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t start, const size_t end, const double logNegError) : @@ -217,8 +189,8 @@ DTree::DTree(const VecType& maxVals, { /* Nothing to do. */ } template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints, const size_t start, const size_t end) : @@ -256,13 +228,12 @@ double DTree::LogNegativeError(const size_t totalPoints) const double err = 2 * std::log((double) (end - start)) - 2 * std::log((double) totalPoints); - VecType valDiffs = maxVals - minVals; - typename VecType::iterator valEnd = valDiffs.end(); - for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i) + StatType valDiffs = maxVals - minVals; + for (size_t i = 0;i < valDiffs.n_elem; ++i) { // Ignore very small dimensions to prevent overflow. - if (*i > 1e-50) - err -= std::log(*i); + if (valDiffs[i] > 1e-50) + err -= std::log(valDiffs[i]); } return err; @@ -279,10 +250,12 @@ bool DTree::FindSplit(const MatType& data, double& rightError, const size_t minLeafSize) const { + typedef std::pair SplitItem; + // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. - assert(data.n_rows == maxVals.n_elem); - assert(data.n_rows == minVals.n_elem); + Log::Assert(data.n_rows == maxVals.n_elem); + Log::Assert(data.n_rows == minVals.n_elem); const size_t points = end - start; @@ -319,44 +292,24 @@ bool DTree::FindSplit(const MatType& data, // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); - - // Get a sorted version of the dimension in interest, - // from the given samples range. This is the most expensive step. - typename MatType::row_type dimVec = detail::ExtractSortedRow(data, - dim, - start, - end); - - typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); - typename MatType::row_col_iterator dI = dimVec.begin_row_col(); - // Find the best split for this dimension. - for (;;) - { - const size_t position = dI.col(); - - // Ensure the minimum leaf size on both sides. We need to figure out why - // there are spikes if this minLeafSize is enforced here... - if (position >= dimVec.n_cols - minLeafSize) - break; - if (position < minLeafSize - 1) - continue; - - ElemType split = *dI; + // Get the values for splitting. The old implementation: + // dimVec = data.row(dim).subvec(start, end - 1); + // dimVec = arma::sort(dimVec); + // could be quite inefficient for sparse matrices, due to copy operations (3). + // This one has custom implementation for dense and sparse matrices. - // In case of sparse matrices and all present values being negative - // this can happen - last many elements being 0, so we need to check. - if (++dI == dimVecEnd) - break; - - // This makes sense for real continuous data. This kinda corrupts the - // data and estimation if the data is ordinal. - split += *dI; - split /= 2.0; + typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + std::vector splitVec = details::ExtractSplits(dimVec, minLeafSize); + + // Iterate on all the splits for this dimension + for (typename std::vector::iterator i = splitVec.begin(); + i != splitVec.end(); + ++i) + { + const ElemType split = i->first; + const size_t position = i->second; - if (split == *dI) - continue; // We can't split here (two points are the same). - // Another way of picking split is using this: // split = leftsplit; if ((split - min > 0.0) && (max - split > 0.0)) @@ -475,14 +428,13 @@ double DTree::Grow(MatType& data, { // Move the data around for the children to have points in a node lie // contiguously (to increase efficiency during the training). -// const size_t splitIndex = splt::PerformSplit(data, start, end - start, ) const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew); // Make max and min vals for the children. - arma::vec maxValsL(maxVals); - arma::vec maxValsR(maxVals); - arma::vec minValsL(minVals); - arma::vec minValsR(minVals); + StatType maxValsL(maxVals); + StatType maxValsR(maxVals); + StatType minValsL(minVals); + StatType minValsR(minVals); maxValsL[dim] = splitValueTmp; minValsR[dim] = splitValueTmp; @@ -524,7 +476,7 @@ double DTree::Grow(MatType& data, else { // We can make this a leaf node. - assert((size_t) (end - start) >= minLeafSize); + Log::Assert((size_t) (end - start) >= minLeafSize); subtreeLeaves = 1; subtreeLeavesLogNegError = logNegError; } From 4083784eb8b7ac54fe182aa4565f7f2f25e19cde Mon Sep 17 00:00:00 2001 From: theJonan Date: Tue, 18 Oct 2016 11:54:50 +0300 Subject: [PATCH 09/25] - All DET sparsification works. --- src/mlpack/methods/det/dt_utils_impl.hpp | 29 +++-- src/mlpack/methods/det/dtree.hpp | 2 +- src/mlpack/methods/det/dtree_impl.hpp | 9 +- src/mlpack/tests/det_test.cpp | 137 +++++++++++++++++++++++ 4 files changed, 156 insertions(+), 21 deletions(-) diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index cad52891554..fcb6b0cbbd4 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -99,11 +99,11 @@ void mlpack::det::PrintVariableImportance(const DTree* dtree, // folds. template DTree* mlpack::det::Trainer(MatType& dataset, - const size_t folds, - const bool useVolumeReg, - const size_t maxLeafSize, - const size_t minLeafSize, - const std::string unprunedTreeOutput) + const size_t folds, + const bool useVolumeReg, + const size_t maxLeafSize, + const size_t minLeafSize, + const std::string unprunedTreeOutput) { // Initialize the tree. DTree dtree(dataset); @@ -170,7 +170,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, << " " << oldAlpha << "." << std::endl; MatType cvData(dataset); - size_t testSize = dataset.n_cols / folds; + const size_t testSize = dataset.n_cols / folds; arma::vec regularizationConstants(prunedSequence.size()); regularizationConstants.fill(0.0); @@ -181,17 +181,17 @@ DTree* mlpack::det::Trainer(MatType& dataset, // implementation. #ifdef _WIN32 #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(testSize, cvData, prunedSequence, regularizationConstants) for (intmax_t fold = 0; fold < (intmax_t) folds; fold++) #else #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(testSize, cvData, prunedSequence, regularizationConstants) for (size_t fold = 0; fold < folds; fold++) #endif { // Break up data into train and test sets. - size_t start = fold * testSize; - size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); + const size_t start = fold * testSize; + const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); MatType test = cvData.cols(start, end - 1); MatType train(cvData.n_rows, cvData.n_cols - test.n_cols); @@ -239,11 +239,10 @@ DTree* mlpack::det::Trainer(MatType& dataset, } // Update the cv regularization constant. - cvRegularizationConstants[i] += 2.0 * cvVal / (double) dataset.n_cols; + cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols; // Determine the new alpha value and prune accordingly. - double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + - prunedSequence[i + 2].first); + double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first); cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg); } @@ -256,9 +255,9 @@ DTree* mlpack::det::Trainer(MatType& dataset, } if (prunedSequence.size() > 2) - cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols; + cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols; - #pragma omp critical + #pragma omp critical (DTreeCVUpdate) regularizationConstants += cvRegularizationConstants; } Timer::Stop("cross_validation"); diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 70c8cc5215c..a4ec63a87dd 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -46,7 +46,7 @@ class DTree */ typedef typename MatType::elem_type ElemType; typedef typename MatType::vec_type VecType; - typedef typename arma::Col StatType; + typedef typename arma::Col StatType; /** * Create an empty density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index ad54039274a..be64f5c7fda 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -146,8 +146,8 @@ template DTree::DTree(MatType & data) : start(0), end(data.n_cols), - minVals(arma::min(data, 1)), maxVals(arma::max(data, 1)), + minVals(arma::min(data, 1)), splitDim(size_t(-1)), splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), @@ -264,11 +264,11 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 - #pragma omp parallel for default(shared) \ + #pragma omp parallel for default(none) \ shared(splitValue, splitDim, data) for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else - #pragma omp parallel for default(shared) \ + #pragma omp parallel for default(none) \ shared(splitValue, splitDim, data) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif @@ -341,10 +341,9 @@ bool DTree::FindSplit(const MatType& data, double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; -#pragma omp atomic +#pragma omp critical (DTreeFindUpdate) if ((actualMinDimError > minError) && dimSplitFound) { -#pragma omp critical DTreeFindUpdate { // Calculate actual error (in logspace) by adding terms back to our // estimate. diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 33659842f19..09b96918ca6 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -141,6 +141,70 @@ BOOST_AUTO_TEST_CASE(TestSplitData) BOOST_REQUIRE_EQUAL(oTest[3], 2); BOOST_REQUIRE_EQUAL(oTest[4], 5); } + +BOOST_AUTO_TEST_CASE(TestSparseFindSplit) +{ + arma::mat realData(4,7); + + realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr + << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr + << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr + << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr; + + arma::sp_mat testData(realData); + + DTree testDTree(testData); + + size_t obDim, trueDim; + double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; + + trueDim = 1; + trueSplit = .5; + trueLeftError = 2 * log(3.0 / 7.0) - (log(7.0) + log(0.5) + log(8.0) + log(6.0)); + trueRightError = 2 * log(4.0 / 7.0) - (log(7.0) + log(6.5) + log(8.0) + log(6.0)); + + testDTree.logVolume = log(7.0) + log(7.0) + log(8.0) + log(6.0); + BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1)); + + BOOST_REQUIRE(trueDim == obDim); + BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10); + + BOOST_REQUIRE_CLOSE(trueLeftError, obLeftError, 1e-10); + BOOST_REQUIRE_CLOSE(trueRightError, obRightError, 1e-10); +} + +BOOST_AUTO_TEST_CASE(TestSparseSplitData) +{ + arma::mat realData(4,7); + + realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr + << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr + << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr + << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr; + + arma::sp_mat testData(realData); + + DTree testDTree(testData); + + arma::Col oTest(7); + oTest << 1 << 2 << 3 << 4 << 5 << 6 << 7; + + size_t splitDim = 1; + double trueSplitVal = .5; + + size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest); + + BOOST_REQUIRE_EQUAL(splitInd, 3); // 2 points on left side. + + BOOST_REQUIRE_EQUAL(oTest[0], 1); + BOOST_REQUIRE_EQUAL(oTest[1], 4); + BOOST_REQUIRE_EQUAL(oTest[2], 3); + BOOST_REQUIRE_EQUAL(oTest[3], 2); + BOOST_REQUIRE_EQUAL(oTest[4], 5); + BOOST_REQUIRE_EQUAL(oTest[5], 6); + BOOST_REQUIRE_EQUAL(oTest[6], 7); +} + #endif // Tests for the public functions. @@ -307,6 +371,79 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) BOOST_REQUIRE_CLOSE((double) (rootError - (lError + rError)), imps[2], 1e-10); } +BOOST_AUTO_TEST_CASE(TestSparsePruneAndUpdate) +{ + arma::mat realData(3, 5); + + realData << 4 << 5 << 7 << 3 << 5 << arma::endr + << 5 << 0 << 1 << 7 << 1 << arma::endr + << 5 << 6 << 7 << 1 << 8 << arma::endr; + + arma::sp_mat testData(realData); + + arma::Col oTest(5); + oTest << 0 << 1 << 2 << 3 << 4; + + DTree testDTree(testData); + double alpha = testDTree.Grow(testData, oTest, false, 2, 1); + alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); + + BOOST_REQUIRE_CLOSE(alpha, numeric_limits::max(), 1e-10); + BOOST_REQUIRE(testDTree.SubtreeLeaves() == 1); + + double rootError = -log(4.0) - log(7.0) - log(7.0); + + BOOST_REQUIRE_CLOSE(testDTree.LogNegError(), rootError, 1e-10); + BOOST_REQUIRE_CLOSE(testDTree.SubtreeLeavesLogNegError(), rootError, 1e-10); + BOOST_REQUIRE(testDTree.Left() == NULL); + BOOST_REQUIRE(testDTree.Right() == NULL); +} + +BOOST_AUTO_TEST_CASE(TestSparseComputeValue) +{ + arma::mat realData(3, 5); + + Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl; + + realData << 4 << 5 << 7 << 3 << 5 << arma::endr + << 5 << 0 << 1 << 7 << 1 << arma::endr + << 5 << 6 << 7 << 1 << 8 << arma::endr; + + arma::vec _q1(3), _q2(3), _q3(3), _q4(3); + + _q1 << 4 << 2 << 2; + _q2 << 5 << 0.25 << 6; + _q3 << 5 << 3 << 7; + _q4 << 2 << 3 << 3; + + arma::sp_mat testData(realData); + arma::sp_vec q1(_q1), q2(_q2), q3(_q3), q4(_q4); + + arma::Col oTest(5); + oTest << 0 << 1 << 2 << 3 << 4; + + DTree testDTree(testData); + double alpha = testDTree.Grow(testData, oTest, false, 2, 1); + + double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); + double d2 = (1.0 / 5.0) / exp(log(4.0) + log(0.5) + log(2.5)); + double d3 = (2.0 / 5.0) / exp(log(4.0) + log(6.5) + log(2.5)); + + BOOST_REQUIRE_CLOSE(d1, testDTree.ComputeValue(q1), 1e-10); + BOOST_REQUIRE_CLOSE(d2, testDTree.ComputeValue(q2), 1e-10); + BOOST_REQUIRE_CLOSE(d3, testDTree.ComputeValue(q3), 1e-10); + BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10); + + alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); + + double d = 1.0 / exp(log(4.0) + log(7.0) + log(7.0)); + + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q1), 1e-10); + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q2), 1e-10); + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q3), 1e-10); + BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10); +} + /** * These are not yet implemented. * From 44fd0c0af8c482aeae0669c28f17d36ddc74a509 Mon Sep 17 00:00:00 2001 From: theJonan Date: Tue, 18 Oct 2016 12:20:03 +0300 Subject: [PATCH 10/25] - OMP test line removed. --- src/mlpack/tests/det_test.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 09b96918ca6..4556d7d689a 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -403,8 +403,6 @@ BOOST_AUTO_TEST_CASE(TestSparseComputeValue) { arma::mat realData(3, 5); - Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl; - realData << 4 << 5 << 7 << 3 << 5 << arma::endr << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; From aa2ad993b56e379261ff7eecfebb666d80ee8e64 Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 13 Oct 2016 16:10:43 +0300 Subject: [PATCH 11/25] - DTree class templated. --- src/mlpack/methods/det/CMakeLists.txt | 2 +- src/mlpack/methods/det/dtree.hpp | 98 ++++------ .../methods/det/{dtree.cpp => dtree_impl.hpp} | 178 ++++++++++-------- 3 files changed, 141 insertions(+), 137 deletions(-) rename src/mlpack/methods/det/{dtree.cpp => dtree_impl.hpp} (77%) diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt index 73b0474eeb3..66edbe6d620 100644 --- a/src/mlpack/methods/det/CMakeLists.txt +++ b/src/mlpack/methods/det/CMakeLists.txt @@ -4,7 +4,7 @@ set(SOURCES # the DET class dtree.hpp - dtree.cpp + dtree_impl.hpp # the util file dt_utils.hpp diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 8e8bbdfe965..f39b5bf36fa 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -36,9 +36,17 @@ namespace det /** Density Estimation Trees */ { * } * @endcode */ +template class DTree { public: + /** + * The actual, underlying type we're working with + */ + typedef typename MatType::elem_type ElemType; + /** * Create an empty density estimation tree. */ @@ -52,8 +60,8 @@ class DTree * @param minVals Minimum values of the bounding box. * @param totalPoints Total number of points in the dataset. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t totalPoints); /** @@ -64,7 +72,7 @@ class DTree * * @param data Dataset to build tree on. */ - DTree(arma::mat& data); + DTree(MatType& data); /** * Create a child node of a density estimation tree given the bounding box @@ -78,8 +86,8 @@ class DTree * @param end End of points represented by this node in the data matrix. * @param error log-negative error of this node. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t start, const size_t end, const double logNegError); @@ -95,8 +103,8 @@ class DTree * @param start Start of points represented by this node in the data matrix. * @param end End of points represented by this node in the data matrix. */ - DTree(const arma::vec& maxVals, - const arma::vec& minVals, + DTree(const VecType& maxVals, + const VecType& minVals, const size_t totalPoints, const size_t start, const size_t end); @@ -114,7 +122,7 @@ class DTree * @param maxLeafSize Maximum size of a leaf. * @param minLeafSize Minimum size of a leaf. */ - double Grow(arma::mat& data, + double Grow(MatType& data, arma::Col& oldFromNew, const bool useVolReg = false, const size_t maxLeafSize = 10, @@ -137,16 +145,7 @@ class DTree * * @param query Point to estimate density of. */ - double ComputeValue(const arma::vec& query) const; - - /** - * Print the tree in a depth-first manner (this function is called - * recursively). - * - * @param fp File to write the tree to. - * @param level Level of the tree (should start at 0). - */ - void WriteTree(FILE *fp, const size_t level = 0) const; + double ComputeValue(const VecType& query) const; /** * Index the buckets for possible usage later; this results in every leaf in @@ -155,7 +154,7 @@ class DTree * * @param tag Tag for the next leaf; leave at 0 for the initial call. */ - int TagTree(const int tag = 0); + TagType TagTree(const TagType tag = 0); /** * Return the tag of the leaf containing the query. This is useful for @@ -163,7 +162,7 @@ class DTree * * @param query Query to search for. */ - int FindBucket(const arma::vec& query) const; + TagType FindBucket(const VecType& query) const; /** * Compute the variable importance of each dimension in the learned tree. @@ -183,7 +182,7 @@ class DTree /** * Return whether a query point is within the range of this node. */ - bool WithinRange(const arma::vec& query) const; + bool WithinRange(const VecType& query) const; private: // The indices in the complete set of points @@ -208,7 +207,7 @@ class DTree size_t splitDim; //! The split value on the splitting dimension for this node. - double splitValue; + ElemType splitValue; //! log-negative-L2-error of the node. double logNegError; @@ -229,7 +228,7 @@ class DTree double logVolume; //! The tag for the leaf, used for hashing points. - int bucketTag; + TagType bucketTag; //! Upper part of alpha sum; used for pruning. double alphaUpper; @@ -247,7 +246,7 @@ class DTree //! Return the split dimension of this node. size_t SplitDim() const { return splitDim; } //! Return the split value of this node. - double SplitValue() const { return splitValue; } + ElemType SplitValue() const { return splitValue; } //! Return the log negative error of this node. double LogNegError() const { return logNegError; } //! Return the log negative error of all descendants of this node. @@ -267,51 +266,24 @@ class DTree bool Root() const { return root; } //! Return the upper part of the alpha sum. double AlphaUpper() const { return alphaUpper; } + //! Return the current bucket's ID, if leaf, or -1 otherwise + TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; } //! Return the maximum values. - const arma::vec& MaxVals() const { return maxVals; } + const VecType& MaxVals() const { return maxVals; } //! Modify the maximum values. - arma::vec& MaxVals() { return maxVals; } + VecType& MaxVals() { return maxVals; } //! Return the minimum values. - const arma::vec& MinVals() const { return minVals; } + const VecType& MinVals() const { return minVals; } //! Modify the minimum values. - arma::vec& MinVals() { return minVals; } + VecType& MinVals() { return minVals; } /** * Serialize the density estimation tree. */ template - void Serialize(Archive& ar, const unsigned int /* version */) - { - using data::CreateNVP; - - ar & CreateNVP(start, "start"); - ar & CreateNVP(end, "end"); - ar & CreateNVP(maxVals, "maxVals"); - ar & CreateNVP(minVals, "minVals"); - ar & CreateNVP(splitDim, "splitDim"); - ar & CreateNVP(splitValue, "splitValue"); - ar & CreateNVP(logNegError, "logNegError"); - ar & CreateNVP(subtreeLeavesLogNegError, "subtreeLeavesLogNegError"); - ar & CreateNVP(subtreeLeaves, "subtreeLeaves"); - ar & CreateNVP(root, "root"); - ar & CreateNVP(ratio, "ratio"); - ar & CreateNVP(logVolume, "logVolume"); - ar & CreateNVP(bucketTag, "bucketTag"); - ar & CreateNVP(alphaUpper, "alphaUpper"); - - if (Archive::is_loading::value) - { - if (left) - delete left; - if (right) - delete right; - } - - ar & CreateNVP(left, "left"); - ar & CreateNVP(right, "right"); - } + void Serialize(Archive& ar, const unsigned int /* version */); private: @@ -320,9 +292,9 @@ class DTree /** * Find the dimension to split on. */ - bool FindSplit(const arma::mat& data, + bool FindSplit(const MatType& data, size_t& splitDim, - double& splitValue, + ElemType& splitValue, double& leftError, double& rightError, const size_t minLeafSize = 5) const; @@ -330,9 +302,9 @@ class DTree /** * Split the data, returning the number of points left of the split. */ - size_t SplitData(arma::mat& data, + size_t SplitData(MatType& data, const size_t splitDim, - const double splitValue, + const ElemType splitValue, arma::Col& oldFromNew) const; }; @@ -340,4 +312,6 @@ class DTree } // namespace det } // namespace mlpack +#include "dtree_impl.hpp" + #endif // MLPACK_METHODS_DET_DTREE_HPP diff --git a/src/mlpack/methods/det/dtree.cpp b/src/mlpack/methods/det/dtree_impl.hpp similarity index 77% rename from src/mlpack/methods/det/dtree.cpp rename to src/mlpack/methods/det/dtree_impl.hpp index c88d480b704..5ef37587e95 100644 --- a/src/mlpack/methods/det/dtree.cpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -12,7 +12,8 @@ using namespace mlpack; using namespace det; -DTree::DTree() : +template +DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), @@ -31,9 +32,11 @@ DTree::DTree() : // Root node initializers -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t totalPoints) : + +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints) : start(0), end(totalPoints), maxVals(maxVals), @@ -52,7 +55,8 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::DTree(arma::mat& data) : +template +DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), @@ -88,11 +92,12 @@ DTree::DTree(arma::mat& data) : // Non-root node initializers -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t start, - const size_t end, - const double logNegError) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t start, + const size_t end, + const double logNegError) : start(start), end(end), maxVals(maxVals), @@ -111,11 +116,12 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::DTree(const arma::vec& maxVals, - const arma::vec& minVals, - const size_t totalPoints, - const size_t start, - const size_t end) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints, + const size_t start, + const size_t end) : start(start), end(end), maxVals(maxVals), @@ -134,7 +140,8 @@ DTree::DTree(const arma::vec& maxVals, right(NULL) { /* Nothing to do. */ } -DTree::~DTree() +template +DTree::~DTree() { delete left; delete right; @@ -142,7 +149,8 @@ DTree::~DTree() // This function computes the log-l2-negative-error of a given node from the // formula R(t) = log(|t|^2 / (N^2 V_t)). -double DTree::LogNegativeError(const size_t totalPoints) const +template +double DTree::LogNegativeError(const size_t totalPoints) const { // log(-|t|^2 / (N^2 V_t)) = log(-1) + 2 log(|t|) - 2 log(N) - log(V_t). double err = 2 * std::log((double) (end - start)) - @@ -162,12 +170,13 @@ double DTree::LogNegativeError(const size_t totalPoints) const // This function finds the best split with respect to the L2-error, by trying // all possible splits. The dataset is the full data set but the start and // end are used to obtain the point in this node. -bool DTree::FindSplit(const arma::mat& data, - size_t& splitDim, - double& splitValue, - double& leftError, - double& rightError, - const size_t minLeafSize) const +template +bool DTree::FindSplit(const MatType& data, + size_t& splitDim, + ElemType& splitValue, + double& leftError, + double& rightError, + const size_t minLeafSize) const { // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. @@ -180,12 +189,20 @@ bool DTree::FindSplit(const arma::mat& data, bool splitFound = false; // Loop through each dimension. - for (size_t dim = 0; dim < maxVals.n_elem; dim++) +#ifdef _WIN32 + #pragma omp parallel for default(none) \ + shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + for (intmax_t dim = 0; fold < (intmax_t) maxVals.n_elem; ++dim) +#else + #pragma omp parallel for default(none) \ + shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + for (size_t dim = 0; dim < maxVals.n_elem; ++dim) +#endif { // Have to deal with REAL, INTEGER, NOMINAL data differently, so we have to // think of how to do that... - const double min = minVals[dim]; - const double max = maxVals[dim]; + const ElemType min = minVals[dim]; + const ElemType max = maxVals[dim]; // If there is nothing to split in this dimension, move on. if (max - min == 0.0) @@ -197,7 +214,7 @@ bool DTree::FindSplit(const arma::mat& data, double minDimError = std::pow(points, 2.0) / (max - min); double dimLeftError = 0.0; // For -Wuninitialized. These variables will double dimRightError = 0.0; // always be set to something else before use. - double dimSplitValue = 0.0; + ElemType dimSplitValue = 0.0; // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); @@ -214,7 +231,7 @@ bool DTree::FindSplit(const arma::mat& data, { // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. - const double split = (dimVec[i] + dimVec[i + 1]) / 2.0; + const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; if (split == dimVec[i]) continue; // We can't split here (two points are the same). @@ -269,10 +286,11 @@ bool DTree::FindSplit(const arma::mat& data, return splitFound; } -size_t DTree::SplitData(arma::mat& data, - const size_t splitDim, - const double splitValue, - arma::Col& oldFromNew) const +template +size_t DTree::SplitData(MatType& data, + const size_t splitDim, + const double splitValue, + arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim // less than or equal to splitValue are on the left side, and all others are @@ -303,11 +321,12 @@ size_t DTree::SplitData(arma::mat& data, } // Greedily expand the tree -double DTree::Grow(arma::mat& data, - arma::Col& oldFromNew, - const bool useVolReg, - const size_t maxLeafSize, - const size_t minLeafSize) +template +double DTree::Grow(MatType& data, + arma::Col& oldFromNew, + const bool useVolReg, + const size_t maxLeafSize, + const size_t minLeafSize) { Log::Assert(data.n_rows == maxVals.n_elem); Log::Assert(data.n_rows == minVals.n_elem); @@ -450,10 +469,10 @@ double DTree::Grow(arma::mat& data, } -double DTree::PruneAndUpdate(const double oldAlpha, - const size_t points, - const bool useVolReg) - +template +double DTree::PruneAndUpdate(const double oldAlpha, + const size_t points, + const bool useVolReg) { // Compute gT. if (subtreeLeaves == 1) // If we are a leaf... @@ -565,7 +584,8 @@ double DTree::PruneAndUpdate(const double oldAlpha, // // Future improvement: Open up the range with epsilons on both sides where // epsilon depends on the density near the boundary. -bool DTree::WithinRange(const arma::vec& query) const +template +bool DTree::WithinRange(const VecType& query) const { for (size_t i = 0; i < query.n_elem; ++i) if ((query[i] < minVals[i]) || (query[i] > maxVals[i])) @@ -575,7 +595,8 @@ bool DTree::WithinRange(const arma::vec& query) const } -double DTree::ComputeValue(const arma::vec& query) const +template +double DTree::ComputeValue(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -607,35 +628,9 @@ double DTree::ComputeValue(const arma::vec& query) const } -void DTree::WriteTree(FILE *fp, const size_t level) const -{ - if (subtreeLeaves > 1) - { - fprintf(fp, "\n"); - for (size_t i = 0; i < level; ++i) - fprintf(fp, "|\t"); - fprintf(fp, "Var. %zu > %lg", splitDim, splitValue); - - right->WriteTree(fp, level + 1); - - fprintf(fp, "\n"); - for (size_t i = 0; i < level; ++i) - fprintf(fp, "|\t"); - fprintf(fp, "Var. %zu <= %lg ", splitDim, splitValue); - - left->WriteTree(fp, level); - } - else // If we are a leaf... - { - fprintf(fp, ": f(x)=%lg", std::exp(std::log(ratio) - logVolume)); - if (bucketTag != -1) - fprintf(fp, " BT:%d", bucketTag); - } -} - - // Index the buckets for possible usage later. -int DTree::TagTree(const int tag) +template +TagType DTree::TagTree(const TagType tag) { if (subtreeLeaves == 1) { @@ -650,7 +645,8 @@ int DTree::TagTree(const int tag) } -int DTree::FindBucket(const arma::vec& query) const +template +TagType DTree::FindBucket(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -670,8 +666,8 @@ int DTree::FindBucket(const arma::vec& query) const } } - -void DTree::ComputeVariableImportance(arma::vec& importances) const +template +void DTree::ComputeVariableImportance(arma::vec& importances) const { // Clear and set to right size. importances.zeros(maxVals.n_elem); @@ -697,3 +693,37 @@ void DTree::ComputeVariableImportance(arma::vec& importances) const nodes.push(curNode.Right()); } } + +template +template +void DTree::Serialize(Archive& ar, const unsigned int /* version */) +{ + using data::CreateNVP; + + ar & CreateNVP(start, "start"); + ar & CreateNVP(end, "end"); + ar & CreateNVP(maxVals, "maxVals"); + ar & CreateNVP(minVals, "minVals"); + ar & CreateNVP(splitDim, "splitDim"); + ar & CreateNVP(splitValue, "splitValue"); + ar & CreateNVP(logNegError, "logNegError"); + ar & CreateNVP(subtreeLeavesLogNegError, "subtreeLeavesLogNegError"); + ar & CreateNVP(subtreeLeaves, "subtreeLeaves"); + ar & CreateNVP(root, "root"); + ar & CreateNVP(ratio, "ratio"); + ar & CreateNVP(logVolume, "logVolume"); + ar & CreateNVP(bucketTag, "bucketTag"); + ar & CreateNVP(alphaUpper, "alphaUpper"); + + if (Archive::is_loading::value) + { + if (left) + delete left; + if (right) + delete right; + } + + ar & CreateNVP(left, "left"); + ar & CreateNVP(right, "right"); +} + From 37a9e5047db2a83cacc1ceb68ef0d47cbebbef7b Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 13 Oct 2016 19:53:50 +0300 Subject: [PATCH 12/25] - First successfull builtd. --- src/mlpack/methods/det/CMakeLists.txt | 4 - src/mlpack/methods/det/det_main.cpp | 4 +- src/mlpack/methods/det/dt_utils.hpp | 23 ++- .../det/{dt_utils.cpp => dt_utils_impl.hpp} | 59 +++--- src/mlpack/methods/det/dtree.hpp | 8 +- src/mlpack/methods/det/dtree_impl.hpp | 171 +++++++++++------- 6 files changed, 153 insertions(+), 116 deletions(-) rename src/mlpack/methods/det/{dt_utils.cpp => dt_utils_impl.hpp} (83%) diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt index 66edbe6d620..4dd3bc32adb 100644 --- a/src/mlpack/methods/det/CMakeLists.txt +++ b/src/mlpack/methods/det/CMakeLists.txt @@ -5,10 +5,6 @@ set(SOURCES # the DET class dtree.hpp dtree_impl.hpp - - # the util file - dt_utils.hpp - dt_utils.cpp ) # add directory name to sources diff --git a/src/mlpack/methods/det/det_main.cpp b/src/mlpack/methods/det/det_main.cpp index f1b33bac5c5..26b394d7802 100644 --- a/src/mlpack/methods/det/det_main.cpp +++ b/src/mlpack/methods/det/det_main.cpp @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) << "(-T) is not specified." << endl; // Are we training a DET or loading from file? - DTree* tree; + DTree* tree; if (CLI::HasParam("training_file")) { const string trainSetFile = CLI::GetParam("training_file"); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) // Obtain the optimal tree. Timer::Start("det_training"); - tree = Trainer(trainingData, folds, regularization, maxLeafSize, + tree = Trainer(trainingData, folds, regularization, maxLeafSize, minLeafSize, ""); Timer::Stop("det_training"); diff --git a/src/mlpack/methods/det/dt_utils.hpp b/src/mlpack/methods/det/dt_utils.hpp index c20f78adac6..067e6fe31d7 100644 --- a/src/mlpack/methods/det/dt_utils.hpp +++ b/src/mlpack/methods/det/dt_utils.hpp @@ -25,8 +25,9 @@ namespace det { * @param numClasses Number of classes in dataset. * @param leafClassMembershipFile Name of file to print to (optional). */ -void PrintLeafMembership(DTree* dtree, - const arma::mat& data, +template +void PrintLeafMembership(DTree* dtree, + const MatType& data, const arma::Mat& labels, const size_t numClasses, const std::string leafClassMembershipFile = ""); @@ -39,7 +40,8 @@ void PrintLeafMembership(DTree* dtree, * @param dtree Density tree to use. * @param viFile Name of file to print to (optional). */ -void PrintVariableImportance(const DTree* dtree, +template +void PrintVariableImportance(const DTree* dtree, const std::string viFile = ""); /** @@ -54,14 +56,17 @@ void PrintVariableImportance(const DTree* dtree, * @param minLeafSize Minimum number of points allowed in a leaf. * @param unprunedTreeOutput Filename to print unpruned tree to (optional). */ -DTree* Trainer(arma::mat& dataset, - const size_t folds, - const bool useVolumeReg = false, - const size_t maxLeafSize = 10, - const size_t minLeafSize = 5, - const std::string unprunedTreeOutput = ""); +template +DTree* Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg = false, + const size_t maxLeafSize = 10, + const size_t minLeafSize = 5, + const std::string unprunedTreeOutput = ""); } // namespace det } // namespace mlpack +#include "dt_utils_impl.hpp" + #endif // MLPACK_METHODS_DET_DT_UTILS_HPP diff --git a/src/mlpack/methods/det/dt_utils.cpp b/src/mlpack/methods/det/dt_utils_impl.hpp similarity index 83% rename from src/mlpack/methods/det/dt_utils.cpp rename to src/mlpack/methods/det/dt_utils_impl.hpp index b8946a92d85..4c057f76dc9 100644 --- a/src/mlpack/methods/det/dt_utils.cpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -10,22 +10,23 @@ using namespace mlpack; using namespace det; -void mlpack::det::PrintLeafMembership(DTree* dtree, - const arma::mat& data, +template +void mlpack::det::PrintLeafMembership(DTree* dtree, + const MatType& data, const arma::Mat& labels, const size_t numClasses, const std::string leafClassMembershipFile) { // Tag the leaves with numbers. - int numLeaves = dtree->TagTree(); + TagType numLeaves = dtree->TagTree(); arma::Mat table(numLeaves, (numClasses + 1)); table.zeros(); for (size_t i = 0; i < data.n_cols; i++) { - const arma::vec testPoint = data.unsafe_col(i); - const int leafTag = dtree->FindBucket(testPoint); + const VecType testPoint = data.unsafe_col(i); + const TagType leafTag = dtree->FindBucket(testPoint); const size_t label = labels[i]; table(leafTag, label) += 1; } @@ -57,8 +58,8 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, return; } - -void mlpack::det::PrintVariableImportance(const DTree* dtree, +template +void mlpack::det::PrintVariableImportance(const DTree* dtree, const std::string viFile) { arma::vec imps; @@ -96,15 +97,16 @@ void mlpack::det::PrintVariableImportance(const DTree* dtree, // This function trains the optimal decision tree using the given number of // folds. -DTree* mlpack::det::Trainer(arma::mat& dataset, - const size_t folds, - const bool useVolumeReg, - const size_t maxLeafSize, - const size_t minLeafSize, - const std::string unprunedTreeOutput) +template +DTree* mlpack::det::Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg, + const size_t maxLeafSize, + const size_t minLeafSize, + const std::string unprunedTreeOutput) { // Initialize the tree. - DTree dtree(dataset); + DTree dtree(dataset); // Prepare to grow the tree... arma::Col oldFromNew(dataset.n_cols); @@ -112,7 +114,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, oldFromNew[i] = i; // Save the dataset since it would be modified while growing the tree. - arma::mat newDataset(dataset); + MatType newDataset(dataset); // Growing the tree double oldAlpha = 0.0; @@ -148,8 +150,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, std::vector > prunedSequence; while (dtree.SubtreeLeaves() > 1) { - std::pair treeSeq(oldAlpha, - dtree.SubtreeLeavesLogNegError()); + std::pair treeSeq(oldAlpha, dtree.SubtreeLeavesLogNegError()); prunedSequence.push_back(treeSeq); oldAlpha = alpha; alpha = dtree.PruneAndUpdate(oldAlpha, dataset.n_cols, useVolumeReg); @@ -157,20 +158,18 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, // Some sanity checks. It seems that on some datasets, the error does not // increase as the tree is pruned but instead stays the same---hence the // "<=" in the final assert. - Log::Assert((alpha < std::numeric_limits::max()) || - (dtree.SubtreeLeaves() == 1)); + Log::Assert((alpha < std::numeric_limits::max()) || (dtree.SubtreeLeaves() == 1)); Log::Assert(alpha > oldAlpha); Log::Assert(dtree.SubtreeLeavesLogNegError() <= treeSeq.second); } - std::pair treeSeq(oldAlpha, - dtree.SubtreeLeavesLogNegError()); + std::pair treeSeq(oldAlpha, dtree.SubtreeLeavesLogNegError()); prunedSequence.push_back(treeSeq); Log::Info << prunedSequence.size() << " trees in the sequence; maximum alpha:" << " " << oldAlpha << "." << std::endl; - arma::mat cvData(dataset); + MatType cvData(dataset); size_t testSize = dataset.n_cols / folds; arma::vec regularizationConstants(prunedSequence.size()); @@ -194,8 +193,8 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, size_t start = fold * testSize; size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); - arma::mat test = cvData.cols(start, end - 1); - arma::mat train(cvData.n_rows, cvData.n_cols - test.n_cols); + MatType test = cvData.cols(start, end - 1); + MatType train(cvData.n_rows, cvData.n_cols - test.n_cols); if (start == 0 && end < cvData.n_cols) { @@ -212,7 +211,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, } // Initialize the tree. - DTree cvDTree(train); + DTree cvDTree(train); // Getting ready to grow the tree... arma::Col cvOldFromNew(train.n_cols); @@ -252,13 +251,12 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, double cvVal = 0.0; for (size_t i = 0; i < test.n_cols; ++i) { - arma::vec testPoint = test.unsafe_col(i); + VecType testPoint = test.unsafe_col(i); cvVal += cvDTree.ComputeValue(testPoint); } if (prunedSequence.size() > 2) - cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / - (double) dataset.n_cols; + cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols; #pragma omp critical regularizationConstants += cvRegularizationConstants; @@ -285,7 +283,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, Log::Info << "Optimal alpha: " << optimalAlpha << "." << std::endl; // Initialize the tree. - DTree* dtreeOpt = new DTree(dataset); + DTree* dtreeOpt = new DTree(dataset); // Getting ready to grow the tree... for (size_t i = 0; i < oldFromNew.n_elem; i++) @@ -296,8 +294,7 @@ DTree* mlpack::det::Trainer(arma::mat& dataset, // Grow the tree. oldAlpha = -DBL_MAX; - alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, - minLeafSize); + alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, minLeafSize); // Prune with optimal alpha. while ((oldAlpha < optimalAlpha) && (dtreeOpt->SubtreeLeaves() > 1)) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index f39b5bf36fa..f34750fd35a 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -73,7 +73,7 @@ class DTree * @param data Dataset to build tree on. */ DTree(MatType& data); - + /** * Create a child node of a density estimation tree given the bounding box * specified by maxVals and minVals, using the size given in start and end and @@ -154,7 +154,7 @@ class DTree * * @param tag Tag for the next leaf; leave at 0 for the initial call. */ - TagType TagTree(const TagType tag = 0); + TagType TagTree(const TagType& tag = 0); /** * Return the tag of the leaf containing the query. This is useful for @@ -271,13 +271,9 @@ class DTree //! Return the maximum values. const VecType& MaxVals() const { return maxVals; } - //! Modify the maximum values. - VecType& MaxVals() { return maxVals; } //! Return the minimum values. const VecType& MinVals() const { return minVals; } - //! Modify the minimum values. - VecType& MinVals() { return minVals; } /** * Serialize the density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 5ef37587e95..58131fb42d3 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -1,4 +1,4 @@ - /** +/** * @file dtree.cpp * @author Parikshit Ram (pram@cc.gatech.edu) * @@ -8,16 +8,91 @@ */ #include "dtree.hpp" #include +#include using namespace mlpack; using namespace det; +namespace detail +{ + template + class DTreeSplit + { + public: + typedef DTreeSplit SplitInfo; + + template + static bool AssignToLeftNode(const VecType& point, + const SplitInfo& splitInfo) + { + return point[splitInfo.splitDimension] < splitInfo.splitVal; + } + + private: + ElemType splitVal; + size_t splitDimension; + }; + + /** + * We need that function, to be able to specialize it for sparse matrices + * in a way which is much faster then usual iteration. + */ + template + void ExtractMinMax(const MatType& data, + VecType& minVals, + VecType& maxVals) + { + // Initialize to first column; values will be overwritten if necessary. + maxVals = data.col(0); + minVals = data.col(0); + + // Loop over data to extract maximum and minimum values in each dimension. + for (size_t i = 1; i < data.n_cols; ++i) + { + for (size_t j = 0; j < data.n_rows; ++j) + { + if (data(j, i) > maxVals[j]) + maxVals[j] = data(j, i); + if (data(j, i) < minVals[j]) + minVals[j] = data(j, i); + } + } + } + + /** + * Here is the optimized specialization + */ + template + void ExtractMinMax(const arma::SpMat& data, + arma::SpCol& minVals, + arma::SpCol& maxVals) + { + // Initialize to first column; values will be overwritten if necessary. + maxVals = data.col(0); + minVals = data.col(0); + + typename arma::sp_mat::iterator dataEnd = data.end(); + + // Loop over data to extract maximum and minimum values in each dimension. + for (typename arma::sp_mat::iterator i = data.begin(); i != dataEnd; ++i) + { + size_t j = i.row(); + if (i.col() == 0) + continue; // we've already taken these values. + else if (*i > maxVals[j]) + maxVals[j] = *i; + else if (*i < minVals[j]) + minVals[j] = *i; + } + } +}; + template DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(-DBL_MAX), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -42,7 +117,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(LogNegativeError(totalPoints)), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -60,7 +135,7 @@ DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), root(true), @@ -71,26 +146,10 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - // Loop over data to extract maximum and minimum values in each dimension. - for (size_t i = 1; i < data.n_cols; ++i) - { - for (size_t j = 0; j < data.n_rows; ++j) - { - if (data(j, i) > maxVals[j]) - maxVals[j] = data(j, i); - if (data(j, i) < minVals[j]) - minVals[j] = data(j, i); - } - } - + detail::ExtractMinMax(data, minVals, maxVals); logNegError = LogNegativeError(data.n_cols); } - // Non-root node initializers template DTree::DTree(const VecType& maxVals, @@ -103,7 +162,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(logNegError), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -127,7 +186,7 @@ DTree::DTree(const VecType& maxVals, maxVals(maxVals), minVals(minVals), splitDim(size_t(-1)), - splitValue(DBL_MAX), + splitValue(std::numeric_limits::max()), logNegError(LogNegativeError(totalPoints)), subtreeLeavesLogNegError(-DBL_MAX), subtreeLeaves(0), @@ -156,12 +215,13 @@ double DTree::LogNegativeError(const size_t totalPoin double err = 2 * std::log((double) (end - start)) - 2 * std::log((double) totalPoints); - arma::vec valDiffs = maxVals - minVals; - for (size_t i = 0; i < maxVals.n_elem; ++i) + VecType valDiffs = maxVals - minVals; + typename VecType::iterator valEnd = valDiffs.end(); + for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i) { // Ignore very small dimensions to prevent overflow. - if (valDiffs[i] > 1e-50) - err -= std::log(valDiffs[i]); + if (*i > 1e-50) + err -= std::log(*i); } return err; @@ -191,11 +251,11 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) - for (intmax_t dim = 0; fold < (intmax_t) maxVals.n_elem; ++dim) + shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { @@ -220,7 +280,7 @@ bool DTree::FindSplit(const MatType& data, double volumeWithoutDim = logVolume - std::log(max - min); // Get the values for the dimension. - arma::rowvec dimVec = data.row(dim).subvec(start, end - 1); + VecType dimVec = data.row(dim).subvec(start, end - 1); // Sort the values in ascending order. dimVec = arma::sort(dimVec); @@ -265,9 +325,9 @@ bool DTree::FindSplit(const MatType& data, } } - double actualMinDimError = std::log(minDimError) - - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; +#pragma omp critical if ((actualMinDimError > minError) && dimSplitFound) { // Calculate actual error (in logspace) by adding terms back to our @@ -275,10 +335,8 @@ bool DTree::FindSplit(const MatType& data, minError = actualMinDimError; splitDim = dim; splitValue = dimSplitValue; - leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - - volumeWithoutDim; - rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - - volumeWithoutDim; + leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; splitFound = true; } // end if better split found in this dimension. } @@ -289,7 +347,7 @@ bool DTree::FindSplit(const MatType& data, template size_t DTree::SplitData(MatType& data, const size_t splitDim, - const double splitValue, + const ElemType splitValue, arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim @@ -310,7 +368,7 @@ size_t DTree::SplitData(MatType& data, data.swap_cols(left, right); - // Store the mapping from old to new. + // Store the mapping from old to new. Do not put std::swap here... const size_t tmp = oldFromNew[left]; oldFromNew[left] = oldFromNew[right]; oldFromNew[right] = tmp; @@ -353,6 +411,7 @@ double DTree::Grow(MatType& data, { // Move the data around for the children to have points in a node lie // contiguously (to increase efficiency during the training). +// const size_t splitIndex = splt::PerformSplit(data, start, end - start, ) const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew); // Make max and min vals for the children. @@ -372,10 +431,8 @@ double DTree::Grow(MatType& data, left = new DTree(maxValsL, minValsL, start, splitIndex, leftError); right = new DTree(maxValsR, minValsR, splitIndex, end, rightError); - leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, - minLeafSize); - rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, - minLeafSize); + leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); + rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); // Store values of R(T~) and |T~|. subtreeLeaves = left->SubtreeLeaves() + right->SubtreeLeaves(); @@ -440,14 +497,12 @@ double DTree::Grow(MatType& data, if (right->SubtreeLeaves() > 1) { - const double exponent = 2 * std::log((double) data.n_cols) + logVolume + - right->AlphaUpper(); + const double exponent = 2 * std::log((double) data.n_cols) + logVolume + right->AlphaUpper(); tmpAlphaSum += std::exp(exponent); } - alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - - logVolume; + alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - logVolume; double gT; if (useVolReg) @@ -613,15 +668,8 @@ double DTree::ComputeValue(const VecType& query) cons } else { - if (query[splitDim] <= splitValue) - { - // If left subtree, go to left child. - return left->ComputeValue(query); - } - else // If right subtree, go to right child - { - return right->ComputeValue(query); - } + // Return either of the two children - left or right, depending on the splitValue + return (query[splitDim] <= splitValue) ? left->ComputeValue(query) : right->ComputeValue(query); } return 0.0; @@ -630,7 +678,7 @@ double DTree::ComputeValue(const VecType& query) cons // Index the buckets for possible usage later. template -TagType DTree::TagTree(const TagType tag) +TagType DTree::TagTree(const TagType& tag) { if (subtreeLeaves == 1) { @@ -654,15 +702,10 @@ TagType DTree::FindBucket(const VecType& query) const { return bucketTag; } - else if (query[splitDim] <= splitValue) - { - // If left subtree, go to left child. - return left->FindBucket(query); - } else { - // If right subtree, go to right child. - return right->FindBucket(query); + // Return the tag from either of the two children - left or right. + return (query[splitDim] <= splitValue) ? left->FindBucket(query) : right->FindBucket(query); } } From f1d446710081a05277b77d4fd2bb9307f14fe9ed Mon Sep 17 00:00:00 2001 From: theJonan Date: Fri, 14 Oct 2016 17:42:25 +0300 Subject: [PATCH 13/25] - DET changes propagated to tests. --- src/mlpack/methods/det/dtree_impl.hpp | 29 ++++++++++++++----------- src/mlpack/tests/det_test.cpp | 27 ++++++++++------------- src/mlpack/tests/serialization_test.cpp | 15 +++++++------ 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 58131fb42d3..2261bbfb436 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -250,12 +250,12 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 - #pragma omp parallel for default(none) \ - shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + #pragma omp parallel for default(shared) \ + shared(splitValue, splitDim, data) for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else - #pragma omp parallel for default(none) \ - shared(minError, splitFound, points, data, minVals, maxVals, minLeafSize, maxLeafSize) + #pragma omp parallel for default(shared) \ + shared(splitValue, splitDim, data) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { @@ -327,17 +327,20 @@ bool DTree::FindSplit(const MatType& data, double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; -#pragma omp critical +#pragma omp atomic if ((actualMinDimError > minError) && dimSplitFound) { - // Calculate actual error (in logspace) by adding terms back to our - // estimate. - minError = actualMinDimError; - splitDim = dim; - splitValue = dimSplitValue; - leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - splitFound = true; +#pragma omp critical DTreeFindUpdate + { + // Calculate actual error (in logspace) by adding terms back to our + // estimate. + minError = actualMinDimError; + splitDim = dim; + splitValue = dimSplitValue; + leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + splitFound = true; + } } // end if better split found in this dimension. } diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 7e32defa4d5..2b0ef3788a0 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE(TestGetMaxMinVals) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree tree(testData); + DTree tree(testData); BOOST_REQUIRE_EQUAL(tree.maxVals[0], 7); BOOST_REQUIRE_EQUAL(tree.minVals[0], 3); @@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(TestComputeNodeError) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); double trueNodeError = -log(4.0) - log(7.0) - log(7.0); BOOST_REQUIRE_CLOSE((double) testDTree.logNegError, trueNodeError, 1e-10); @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(TestWithinRange) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); arma::vec testQuery(3); testQuery << 4.5 << 2.5 << 2; @@ -95,11 +95,10 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); size_t obDim, trueDim; - double trueLeftError, obLeftError, trueRightError, obRightError, - obSplit, trueSplit; + double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; trueDim = 2; trueSplit = 5.5; @@ -107,8 +106,7 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) trueRightError = 2 * log(3.0 / 5.0) - (log(7.0) + log(4.0) + log(2.5)); testDTree.logVolume = log(7.0) + log(4.0) + log(7.0); - BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, - obRightError, 1)); + BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1)); BOOST_REQUIRE(trueDim == obDim); BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10); @@ -125,7 +123,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); arma::Col oTest(5); oTest << 1 << 2 << 3 << 4 << 5; @@ -133,8 +131,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) size_t splitDim = 2; double trueSplitVal = 5.5; - size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, - oTest); + size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest); BOOST_REQUIRE_EQUAL(splitInd, 2); // 2 points on left side. @@ -169,7 +166,7 @@ BOOST_AUTO_TEST_CASE(TestGrow) rlError = 2 * log(1.0 / 5.0) - (log(0.5) + log(4.0) + log(2.5)); rrError = 2 * log(2.0 / 5.0) - (log(6.5) + log(4.0) + log(2.5)); - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); BOOST_REQUIRE_EQUAL(oTest[0], 0); @@ -222,7 +219,7 @@ BOOST_AUTO_TEST_CASE(TestPruneAndUpdate) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); @@ -255,7 +252,7 @@ BOOST_AUTO_TEST_CASE(TestComputeValue) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); @@ -298,7 +295,7 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); testDTree.Grow(testData, oTest, false, 2, 1); arma::vec imps; diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp index e6aecc78b43..d7ea76e74fb 100644 --- a/src/mlpack/tests/serialization_test.cpp +++ b/src/mlpack/tests/serialization_test.cpp @@ -853,18 +853,19 @@ BOOST_AUTO_TEST_CASE(SoftmaxRegressionTest) BOOST_AUTO_TEST_CASE(DETTest) { using det::DTree; + typedef DTree DTreeX; // Create a density estimation tree on a random dataset. arma::mat dataset = arma::randu(25, 5000); - DTree tree(dataset); + DTreeX tree(dataset); arma::mat otherDataset = arma::randu(5, 100); - DTree xmlTree, binaryTree, textTree(otherDataset); + DTreeX xmlTree, binaryTree, textTree(otherDataset); SerializeObjectAll(tree, xmlTree, binaryTree, textTree); - std::stack stack, xmlStack, binaryStack, textStack; + std::stack stack, xmlStack, binaryStack, textStack; stack.push(&tree); xmlStack.push(&xmlTree); binaryStack.push(&binaryTree); @@ -873,10 +874,10 @@ BOOST_AUTO_TEST_CASE(DETTest) while (!stack.empty()) { // Get the top node from the stack. - DTree* node = stack.top(); - DTree* xmlNode = xmlStack.top(); - DTree* binaryNode = binaryStack.top(); - DTree* textNode = textStack.top(); + DTreeX* node = stack.top(); + DTreeX* xmlNode = xmlStack.top(); + DTreeX* binaryNode = binaryStack.top(); + DTreeX* textNode = textStack.top(); stack.pop(); xmlStack.pop(); From 45ff5baa3cf6606ccff5dddb3fbff55df261e9a3 Mon Sep 17 00:00:00 2001 From: theJonan Date: Fri, 14 Oct 2016 23:06:19 +0300 Subject: [PATCH 14/25] - DET templating ready and tests passing. --- .../core/arma_extend/Mat_extra_bones.hpp | 9 + .../core/arma_extend/SpMat_extra_bones.hpp | 8 + src/mlpack/methods/det/det_main.cpp | 5 +- src/mlpack/methods/det/dt_utils.hpp | 22 +-- src/mlpack/methods/det/dt_utils_impl.hpp | 22 +-- src/mlpack/methods/det/dtree.hpp | 2 +- src/mlpack/methods/det/dtree_impl.hpp | 187 +++++++----------- src/mlpack/tests/det_test.cpp | 18 +- src/mlpack/tests/serialization_test.cpp | 2 +- 9 files changed, 126 insertions(+), 149 deletions(-) diff --git a/src/mlpack/core/arma_extend/Mat_extra_bones.hpp b/src/mlpack/core/arma_extend/Mat_extra_bones.hpp index e09f5f55b0b..f4f25d610a4 100644 --- a/src/mlpack/core/arma_extend/Mat_extra_bones.hpp +++ b/src/mlpack/core/arma_extend/Mat_extra_bones.hpp @@ -12,6 +12,15 @@ template void serialize(Archive& ar, const unsigned int version); +/** + * These will help us refer the proper vector / column types, only with + * specifying the matrix type we want to use. + */ + +typedef Col vec_type; +typedef Col col_type; +typedef Row row_type; + /* * Add row_col_iterator and row_col_const_iterator to arma::Mat. */ diff --git a/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp b/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp index d3c18dec1e5..a5d274c4b96 100644 --- a/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp +++ b/src/mlpack/core/arma_extend/SpMat_extra_bones.hpp @@ -16,6 +16,14 @@ template void serialize(Archive& ar, const unsigned int version); +/** + * These will help us refer the proper vector / column types, only with + * specifying the matrix type we want to use. + */ +typedef SpCol vec_type; +typedef SpCol col_type; +typedef SpRow row_type; + /* * Extra functions for SpMat * Adding definition of row_col_iterator to generalize with Mat::row_col_iterator diff --git a/src/mlpack/methods/det/det_main.cpp b/src/mlpack/methods/det/det_main.cpp index 26b394d7802..16ffa25b774 100644 --- a/src/mlpack/methods/det/det_main.cpp +++ b/src/mlpack/methods/det/det_main.cpp @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) << "(-T) is not specified." << endl; // Are we training a DET or loading from file? - DTree* tree; + DTree* tree; if (CLI::HasParam("training_file")) { const string trainSetFile = CLI::GetParam("training_file"); @@ -127,8 +127,7 @@ int main(int argc, char *argv[]) // Obtain the optimal tree. Timer::Start("det_training"); - tree = Trainer(trainingData, folds, regularization, maxLeafSize, - minLeafSize, ""); + tree = Trainer(trainingData, folds, regularization, maxLeafSize, minLeafSize, ""); Timer::Stop("det_training"); // Compute training set estimates, if desired. diff --git a/src/mlpack/methods/det/dt_utils.hpp b/src/mlpack/methods/det/dt_utils.hpp index 067e6fe31d7..3535eaeadd7 100644 --- a/src/mlpack/methods/det/dt_utils.hpp +++ b/src/mlpack/methods/det/dt_utils.hpp @@ -25,8 +25,8 @@ namespace det { * @param numClasses Number of classes in dataset. * @param leafClassMembershipFile Name of file to print to (optional). */ -template -void PrintLeafMembership(DTree* dtree, +template +void PrintLeafMembership(DTree* dtree, const MatType& data, const arma::Mat& labels, const size_t numClasses, @@ -40,8 +40,8 @@ void PrintLeafMembership(DTree* dtree, * @param dtree Density tree to use. * @param viFile Name of file to print to (optional). */ -template -void PrintVariableImportance(const DTree* dtree, +template +void PrintVariableImportance(const DTree* dtree, const std::string viFile = ""); /** @@ -56,13 +56,13 @@ void PrintVariableImportance(const DTree* dtree, * @param minLeafSize Minimum number of points allowed in a leaf. * @param unprunedTreeOutput Filename to print unpruned tree to (optional). */ -template -DTree* Trainer(MatType& dataset, - const size_t folds, - const bool useVolumeReg = false, - const size_t maxLeafSize = 10, - const size_t minLeafSize = 5, - const std::string unprunedTreeOutput = ""); +template +DTree* Trainer(MatType& dataset, + const size_t folds, + const bool useVolumeReg = false, + const size_t maxLeafSize = 10, + const size_t minLeafSize = 5, + const std::string unprunedTreeOutput = ""); } // namespace det } // namespace mlpack diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index 4c057f76dc9..cad52891554 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -10,8 +10,8 @@ using namespace mlpack; using namespace det; -template -void mlpack::det::PrintLeafMembership(DTree* dtree, +template +void mlpack::det::PrintLeafMembership(DTree* dtree, const MatType& data, const arma::Mat& labels, const size_t numClasses, @@ -25,7 +25,7 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, for (size_t i = 0; i < data.n_cols; i++) { - const VecType testPoint = data.unsafe_col(i); + const typename MatType::vec_type testPoint = data.unsafe_col(i); const TagType leafTag = dtree->FindBucket(testPoint); const size_t label = labels[i]; table(leafTag, label) += 1; @@ -58,8 +58,8 @@ void mlpack::det::PrintLeafMembership(DTree* dtree, return; } -template -void mlpack::det::PrintVariableImportance(const DTree* dtree, +template +void mlpack::det::PrintVariableImportance(const DTree* dtree, const std::string viFile) { arma::vec imps; @@ -97,8 +97,8 @@ void mlpack::det::PrintVariableImportance(const DTree // This function trains the optimal decision tree using the given number of // folds. -template -DTree* mlpack::det::Trainer(MatType& dataset, +template +DTree* mlpack::det::Trainer(MatType& dataset, const size_t folds, const bool useVolumeReg, const size_t maxLeafSize, @@ -106,7 +106,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, const std::string unprunedTreeOutput) { // Initialize the tree. - DTree dtree(dataset); + DTree dtree(dataset); // Prepare to grow the tree... arma::Col oldFromNew(dataset.n_cols); @@ -211,7 +211,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, } // Initialize the tree. - DTree cvDTree(train); + DTree cvDTree(train); // Getting ready to grow the tree... arma::Col cvOldFromNew(train.n_cols); @@ -251,7 +251,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, double cvVal = 0.0; for (size_t i = 0; i < test.n_cols; ++i) { - VecType testPoint = test.unsafe_col(i); + typename MatType::vec_type testPoint = test.unsafe_col(i); cvVal += cvDTree.ComputeValue(testPoint); } @@ -283,7 +283,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, Log::Info << "Optimal alpha: " << optimalAlpha << "." << std::endl; // Initialize the tree. - DTree* dtreeOpt = new DTree(dataset); + DTree* dtreeOpt = new DTree(dataset); // Getting ready to grow the tree... for (size_t i = 0; i < oldFromNew.n_elem; i++) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index f34750fd35a..62342879b5a 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -37,7 +37,6 @@ namespace det /** Density Estimation Trees */ { * @endcode */ template class DTree { @@ -46,6 +45,7 @@ class DTree * The actual, underlying type we're working with */ typedef typename MatType::elem_type ElemType; + typedef typename MatType::vec_type VecType; /** * Create an empty density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 2261bbfb436..b4560242d17 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -32,63 +32,10 @@ namespace detail ElemType splitVal; size_t splitDimension; }; - - /** - * We need that function, to be able to specialize it for sparse matrices - * in a way which is much faster then usual iteration. - */ - template - void ExtractMinMax(const MatType& data, - VecType& minVals, - VecType& maxVals) - { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - // Loop over data to extract maximum and minimum values in each dimension. - for (size_t i = 1; i < data.n_cols; ++i) - { - for (size_t j = 0; j < data.n_rows; ++j) - { - if (data(j, i) > maxVals[j]) - maxVals[j] = data(j, i); - if (data(j, i) < minVals[j]) - minVals[j] = data(j, i); - } - } - } - - /** - * Here is the optimized specialization - */ - template - void ExtractMinMax(const arma::SpMat& data, - arma::SpCol& minVals, - arma::SpCol& maxVals) - { - // Initialize to first column; values will be overwritten if necessary. - maxVals = data.col(0); - minVals = data.col(0); - - typename arma::sp_mat::iterator dataEnd = data.end(); - - // Loop over data to extract maximum and minimum values in each dimension. - for (typename arma::sp_mat::iterator i = data.begin(); i != dataEnd; ++i) - { - size_t j = i.row(); - if (i.col() == 0) - continue; // we've already taken these values. - else if (*i > maxVals[j]) - maxVals[j] = *i; - else if (*i < minVals[j]) - minVals[j] = *i; - } - } }; -template -DTree::DTree() : +template +DTree::DTree() : start(0), end(0), splitDim(size_t(-1)), @@ -108,10 +55,10 @@ DTree::DTree() : // Root node initializers -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t totalPoints) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints) : start(0), end(totalPoints), maxVals(maxVals), @@ -130,8 +77,8 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::DTree(MatType & data) : +template +DTree::DTree(MatType & data) : start(0), end(data.n_cols), splitDim(size_t(-1)), @@ -146,17 +93,31 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - detail::ExtractMinMax(data, minVals, maxVals); + maxVals = data.col(0); + minVals = data.col(0); + + typename MatType::row_col_iterator dataEnd = data.end_row_col(); + + // Loop over data to extract maximum and minimum values in each dimension. + for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i) + { + size_t j = i.row(); + if (*i > maxVals[j]) + maxVals[j] = *i; + else if (*i < minVals[j]) + minVals[j] = *i; + } + logNegError = LogNegativeError(data.n_cols); } // Non-root node initializers -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t start, - const size_t end, - const double logNegError) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t start, + const size_t end, + const double logNegError) : start(start), end(end), maxVals(maxVals), @@ -175,12 +136,12 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, - const size_t totalPoints, - const size_t start, - const size_t end) : +template +DTree::DTree(const VecType& maxVals, + const VecType& minVals, + const size_t totalPoints, + const size_t start, + const size_t end) : start(start), end(end), maxVals(maxVals), @@ -199,8 +160,8 @@ DTree::DTree(const VecType& maxVals, right(NULL) { /* Nothing to do. */ } -template -DTree::~DTree() +template +DTree::~DTree() { delete left; delete right; @@ -208,8 +169,8 @@ DTree::~DTree() // This function computes the log-l2-negative-error of a given node from the // formula R(t) = log(|t|^2 / (N^2 V_t)). -template -double DTree::LogNegativeError(const size_t totalPoints) const +template +double DTree::LogNegativeError(const size_t totalPoints) const { // log(-|t|^2 / (N^2 V_t)) = log(-1) + 2 log(|t|) - 2 log(N) - log(V_t). double err = 2 * std::log((double) (end - start)) - @@ -230,13 +191,13 @@ double DTree::LogNegativeError(const size_t totalPoin // This function finds the best split with respect to the L2-error, by trying // all possible splits. The dataset is the full data set but the start and // end are used to obtain the point in this node. -template -bool DTree::FindSplit(const MatType& data, - size_t& splitDim, - ElemType& splitValue, - double& leftError, - double& rightError, - const size_t minLeafSize) const +template +bool DTree::FindSplit(const MatType& data, + size_t& splitDim, + ElemType& splitValue, + double& leftError, + double& rightError, + const size_t minLeafSize) const { // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. @@ -280,7 +241,7 @@ bool DTree::FindSplit(const MatType& data, double volumeWithoutDim = logVolume - std::log(max - min); // Get the values for the dimension. - VecType dimVec = data.row(dim).subvec(start, end - 1); + typename MatType::row_type dimVec = data.row(dim).subvec(start, end - 1); // Sort the values in ascending order. dimVec = arma::sort(dimVec); @@ -347,11 +308,11 @@ bool DTree::FindSplit(const MatType& data, return splitFound; } -template -size_t DTree::SplitData(MatType& data, - const size_t splitDim, - const ElemType splitValue, - arma::Col& oldFromNew) const +template +size_t DTree::SplitData(MatType& data, + const size_t splitDim, + const ElemType splitValue, + arma::Col& oldFromNew) const { // Swap all columns such that any columns with value in dimension splitDim // less than or equal to splitValue are on the left side, and all others are @@ -382,12 +343,12 @@ size_t DTree::SplitData(MatType& data, } // Greedily expand the tree -template -double DTree::Grow(MatType& data, - arma::Col& oldFromNew, - const bool useVolReg, - const size_t maxLeafSize, - const size_t minLeafSize) +template +double DTree::Grow(MatType& data, + arma::Col& oldFromNew, + const bool useVolReg, + const size_t maxLeafSize, + const size_t minLeafSize) { Log::Assert(data.n_rows == maxVals.n_elem); Log::Assert(data.n_rows == minVals.n_elem); @@ -527,10 +488,10 @@ double DTree::Grow(MatType& data, } -template -double DTree::PruneAndUpdate(const double oldAlpha, - const size_t points, - const bool useVolReg) +template +double DTree::PruneAndUpdate(const double oldAlpha, + const size_t points, + const bool useVolReg) { // Compute gT. if (subtreeLeaves == 1) // If we are a leaf... @@ -642,8 +603,8 @@ double DTree::PruneAndUpdate(const double oldAlpha, // // Future improvement: Open up the range with epsilons on both sides where // epsilon depends on the density near the boundary. -template -bool DTree::WithinRange(const VecType& query) const +template +bool DTree::WithinRange(const VecType& query) const { for (size_t i = 0; i < query.n_elem; ++i) if ((query[i] < minVals[i]) || (query[i] > maxVals[i])) @@ -653,8 +614,8 @@ bool DTree::WithinRange(const VecType& query) const } -template -double DTree::ComputeValue(const VecType& query) const +template +double DTree::ComputeValue(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -680,8 +641,8 @@ double DTree::ComputeValue(const VecType& query) cons // Index the buckets for possible usage later. -template -TagType DTree::TagTree(const TagType& tag) +template +TagType DTree::TagTree(const TagType& tag) { if (subtreeLeaves == 1) { @@ -696,8 +657,8 @@ TagType DTree::TagTree(const TagType& tag) } -template -TagType DTree::FindBucket(const VecType& query) const +template +TagType DTree::FindBucket(const VecType& query) const { Log::Assert(query.n_elem == maxVals.n_elem); @@ -712,8 +673,8 @@ TagType DTree::FindBucket(const VecType& query) const } } -template -void DTree::ComputeVariableImportance(arma::vec& importances) const +template +void DTree::ComputeVariableImportance(arma::vec& importances) const { // Clear and set to right size. importances.zeros(maxVals.n_elem); @@ -740,9 +701,9 @@ void DTree::ComputeVariableImportance(arma::vec& impo } } -template +template template -void DTree::Serialize(Archive& ar, const unsigned int /* version */) +void DTree::Serialize(Archive& ar, const unsigned int /* version */) { using data::CreateNVP; diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 2b0ef3788a0..33659842f19 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE(TestGetMaxMinVals) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree tree(testData); + DTree tree(testData); BOOST_REQUIRE_EQUAL(tree.maxVals[0], 7); BOOST_REQUIRE_EQUAL(tree.minVals[0], 3); @@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(TestComputeNodeError) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); double trueNodeError = -log(4.0) - log(7.0) - log(7.0); BOOST_REQUIRE_CLOSE((double) testDTree.logNegError, trueNodeError, 1e-10); @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(TestWithinRange) arma::vec maxVals("7 7 8"); arma::vec minVals("3 0 1"); - DTree testDTree(maxVals, minVals, 5); + DTree testDTree(maxVals, minVals, 5); arma::vec testQuery(3); testQuery << 4.5 << 2.5 << 2; @@ -95,7 +95,7 @@ BOOST_AUTO_TEST_CASE(TestFindSplit) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); size_t obDim, trueDim; double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; @@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(TestSplitData) << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; - DTree testDTree(testData); + DTree testDTree(testData); arma::Col oTest(5); oTest << 1 << 2 << 3 << 4 << 5; @@ -166,7 +166,7 @@ BOOST_AUTO_TEST_CASE(TestGrow) rlError = 2 * log(1.0 / 5.0) - (log(0.5) + log(4.0) + log(2.5)); rrError = 2 * log(2.0 / 5.0) - (log(6.5) + log(4.0) + log(2.5)); - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); BOOST_REQUIRE_EQUAL(oTest[0], 0); @@ -219,7 +219,7 @@ BOOST_AUTO_TEST_CASE(TestPruneAndUpdate) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); @@ -252,7 +252,7 @@ BOOST_AUTO_TEST_CASE(TestComputeValue) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); double alpha = testDTree.Grow(testData, oTest, false, 2, 1); double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); @@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) arma::Col oTest(5); oTest << 0 << 1 << 2 << 3 << 4; - DTree testDTree(testData); + DTree testDTree(testData); testDTree.Grow(testData, oTest, false, 2, 1); arma::vec imps; diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp index d7ea76e74fb..64769df90b0 100644 --- a/src/mlpack/tests/serialization_test.cpp +++ b/src/mlpack/tests/serialization_test.cpp @@ -853,7 +853,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxRegressionTest) BOOST_AUTO_TEST_CASE(DETTest) { using det::DTree; - typedef DTree DTreeX; + typedef DTree DTreeX; // Create a density estimation tree on a random dataset. arma::mat dataset = arma::randu(25, 5000); From 9bee58e15cc243071f6e8230ae496057f524221b Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 14:30:48 +0300 Subject: [PATCH 15/25] - More sparse-matrix migration steps. --- src/mlpack/methods/det/dtree.hpp | 4 +- src/mlpack/methods/det/dtree_impl.hpp | 106 ++++++++++++++++++++++---- 2 files changed, 95 insertions(+), 15 deletions(-) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 62342879b5a..9a2b1751f87 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -199,9 +199,9 @@ class DTree size_t end; //! Upper half of bounding box for this node. - arma::vec maxVals; + VecType maxVals; //! Lower half of bounding box for this node. - arma::vec minVals; + VecType minVals; //! The splitting dimension for this node. size_t splitDim; diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index b4560242d17..b683ffddf7a 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -8,6 +8,8 @@ */ #include "dtree.hpp" #include +#include +#include #include using namespace mlpack; @@ -32,6 +34,68 @@ namespace detail ElemType splitVal; size_t splitDimension; }; + + /** + * Get the values for the dimension and sort them. The old implementation: + * dimVec = data.row(dim).subvec(start, end - 1); + * dimVec = arma::sort(dimVec); + * was quite inefficient, due to many (3) vector copy operations. This could be a + * problem especially for sparse matrices. That's why they have custom implementation. + */ + template + typename MatType::row_type ExtractSortedRow(const MatType& data, + size_t dim, + size_t start, + size_t end) + { + typedef typename MatType::elem_type ElemType; + + assert(start < end); + + typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + std::sort(dimVec.begin(), dimVec.end()); + return dimVec; + } + +// template +// std::vector > ExtractSortedRow(const arma::SpMat& data, +// size_t dim, +// size_t start, +// size_t end, +// size_t padding) +// { +// typedef SortedItem SortedType; +// +// assert(padding > 0); +// assert(start < end); +// +// arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); +// typename arma::SpRow::iterator dimVecEnd = dimVec.end(); +// +// // Build the vector to be sorted with values in ascending order. +// std::vector sortedDim = std::vector(); +// sortedDim.reserve(dimVec.n_elem / 2); +// +// // Prepare these for the iteration. +// ElemType lastVal = 0; +// --padding; +// +// // Iterate over the row and put only different values, also skipping +// // `padding` number of elements from both sides of the row. +// for (typename MatType::row_col_iterator di = dimVec.begin_row_col(); di != dimVecEnd; ++di) +// { +// if (di.col() < padding || di.col() >= dimVec.n_elem - padding) +// continue; +// // if (*di == lastVal && sortedDim.size() > 0) +// // continue; +// +// sortedDim.push_back(lastVal = *di); +// } +// +// std::sort(sortedDim.begin(), sortedDim.end()); +// return sortedDim; +// } + }; template @@ -239,22 +303,38 @@ bool DTree::FindSplit(const MatType& data, // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); - - // Get the values for the dimension. - typename MatType::row_type dimVec = data.row(dim).subvec(start, end - 1); - - // Sort the values in ascending order. - dimVec = arma::sort(dimVec); + + // Get a sorted version of the dimension in interest, + // from the given samples range. This is the most expensive step. + typename MatType::row_type dimVec = detail::ExtractSortedRow(data, + dim, + start, + end); + + typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); + typename MatType::row_col_iterator dI = dimVec.begin_row_col(); // Find the best split for this dimension. We need to figure out why // there are spikes if this minLeafSize is enforced here... - for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) + for (;;) { + const size_t position = dI.col(); + + if (position >= dimVec.n_cols - minLeafSize) + break; + if (position < minLeafSize - 1) + continue; + + ElemType split = *dI; + if (++dI == dimVecEnd) + break; // This means we have same values till the end => No split. + // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. - const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; - - if (split == dimVec[i]) + split += *dI; + split /= 2.0; + + if (split == *dI) continue; // We can't split here (two points are the same). // Another way of picking split is using this: @@ -263,7 +343,7 @@ bool DTree::FindSplit(const MatType& data, { // Ensure that the right node will have at least the minimum number of // points. - Log::Assert((points - i - 1) >= minLeafSize); + Log::Assert((points - position - 1) >= minLeafSize); // Now we have to see if the error will be reduced. Simple manipulation // of the error function gives us the condition we must satisfy: @@ -271,8 +351,8 @@ bool DTree::FindSplit(const MatType& data, // and because the volume is only dependent on the dimension we are // splitting, we can assume V_l is just the range of the left and V_r is // just the range of the right. - double negLeftError = std::pow(i + 1, 2.0) / (split - min); - double negRightError = std::pow(points - i - 1, 2.0) / (max - split); + double negLeftError = std::pow(position + 1, 2.0) / (split - min); + double negRightError = std::pow(points - position - 1, 2.0) / (max - split); // If this is better, take it. if ((negLeftError + negRightError) >= minDimError) From 6b3ae6c17583d99694ac35aa629a95ce01168ec2 Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 17:03:18 +0300 Subject: [PATCH 16/25] - First successfull SpMat build. --- src/mlpack/methods/det/dtree_impl.hpp | 104 +++++++++++++++----------- 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index b683ffddf7a..16092b8034e 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -8,13 +8,52 @@ */ #include "dtree.hpp" #include -#include #include +#include #include using namespace mlpack; using namespace det; +namespace arma +{ + template + SpMat sort(const SpMat& data) + { + typedef std::vector ElemVectorType; + + // Construct the vector of values / locations... + ElemVectorType indexVec(data.begin(), data.end()); + + // ... and sort it! + std::sort(indexVec.begin(), indexVec.end()); + + // Now prepare the structures for the batch construction of the + // sorted sparse row. + arma::umat locations(2, data.n_nonzero); + arma::Col vals(data.n_nonzero); + ElemType lastVal = -std::numeric_limits::max(); + size_t padding = 0; + + for (size_t ii = 0; ii < indexVec.size(); ++ii) + { + const ElemType newVal = indexVec[ii]; + if (lastVal < ElemType(0) && newVal > ElemType(0)) + { + assert(padding == 0); // we should arrive here once! + padding = data.n_elem - data.n_nonzero; + } + + locations.at(0, ii) = (ii + padding) % data.n_rows; + locations.at(1, ii) = (ii + padding) / data.n_rows; + vals.at(ii) = lastVal = newVal; + } + + return SpMat(locations, vals, data.n_rows, data.n_cols, false, false); + }; + +}; + namespace detail { template @@ -57,44 +96,21 @@ namespace detail return dimVec; } -// template -// std::vector > ExtractSortedRow(const arma::SpMat& data, -// size_t dim, -// size_t start, -// size_t end, -// size_t padding) -// { -// typedef SortedItem SortedType; -// -// assert(padding > 0); -// assert(start < end); -// -// arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); -// typename arma::SpRow::iterator dimVecEnd = dimVec.end(); -// -// // Build the vector to be sorted with values in ascending order. -// std::vector sortedDim = std::vector(); -// sortedDim.reserve(dimVec.n_elem / 2); -// -// // Prepare these for the iteration. -// ElemType lastVal = 0; -// --padding; -// -// // Iterate over the row and put only different values, also skipping -// // `padding` number of elements from both sides of the row. -// for (typename MatType::row_col_iterator di = dimVec.begin_row_col(); di != dimVecEnd; ++di) -// { -// if (di.col() < padding || di.col() >= dimVec.n_elem - padding) -// continue; -// // if (*di == lastVal && sortedDim.size() > 0) -// // continue; -// -// sortedDim.push_back(lastVal = *di); -// } -// -// std::sort(sortedDim.begin(), sortedDim.end()); -// return sortedDim; -// } + /** + * We need a custom implementation for sparse matrix, in order to save sorting of + * all these zeroes. + */ + template + arma::SpRow ExtractSortedRow(const arma::SpMat& data, + size_t dim, + size_t start, + size_t end) + { + assert(start < end); + + arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); + return arma::sort(dimVec); + } }; @@ -314,20 +330,24 @@ bool DTree::FindSplit(const MatType& data, typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); typename MatType::row_col_iterator dI = dimVec.begin_row_col(); - // Find the best split for this dimension. We need to figure out why - // there are spikes if this minLeafSize is enforced here... + // Find the best split for this dimension. for (;;) { const size_t position = dI.col(); + // Ensure the minimum leaf size on both sides. We need to figure out why + // there are spikes if this minLeafSize is enforced here... if (position >= dimVec.n_cols - minLeafSize) break; if (position < minLeafSize - 1) continue; ElemType split = *dI; + + // In case of sparse matrices and all present values being negative + // this can happen - last many elements being 0, so we need to check. if (++dI == dimVecEnd) - break; // This means we have same values till the end => No split. + break; // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. From dc8d9b1c7170af2f891538a8585d0494381430c0 Mon Sep 17 00:00:00 2001 From: theJonan Date: Mon, 17 Oct 2016 22:55:41 +0300 Subject: [PATCH 17/25] - Sparse DTree finally working. --- src/mlpack/methods/det/dtree.hpp | 25 +-- src/mlpack/methods/det/dtree_impl.hpp | 256 +++++++++++--------------- 2 files changed, 117 insertions(+), 164 deletions(-) diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 9a2b1751f87..70c8cc5215c 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -44,8 +44,9 @@ class DTree /** * The actual, underlying type we're working with */ - typedef typename MatType::elem_type ElemType; - typedef typename MatType::vec_type VecType; + typedef typename MatType::elem_type ElemType; + typedef typename MatType::vec_type VecType; + typedef typename arma::Col StatType; /** * Create an empty density estimation tree. @@ -60,8 +61,8 @@ class DTree * @param minVals Minimum values of the bounding box. * @param totalPoints Total number of points in the dataset. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints); /** @@ -86,8 +87,8 @@ class DTree * @param end End of points represented by this node in the data matrix. * @param error log-negative error of this node. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t start, const size_t end, const double logNegError); @@ -103,8 +104,8 @@ class DTree * @param start Start of points represented by this node in the data matrix. * @param end End of points represented by this node in the data matrix. */ - DTree(const VecType& maxVals, - const VecType& minVals, + DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints, const size_t start, const size_t end); @@ -199,9 +200,9 @@ class DTree size_t end; //! Upper half of bounding box for this node. - VecType maxVals; + StatType maxVals; //! Lower half of bounding box for this node. - VecType minVals; + StatType minVals; //! The splitting dimension for this node. size_t splitDim; @@ -270,10 +271,10 @@ class DTree TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; } //! Return the maximum values. - const VecType& MaxVals() const { return maxVals; } + const StatType& MaxVals() const { return maxVals; } //! Return the minimum values. - const VecType& MinVals() const { return minVals; } + const StatType& MinVals() const { return minVals; } /** * Serialize the density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 16092b8034e..ad54039274a 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -5,113 +5,98 @@ * Implementations of some declared functions in * the Density Estimation Tree class. * + * Sparsification and optimizations. + * @author Ivan Georgiev (ivan@jonan.info) + * */ #include "dtree.hpp" #include -#include #include -#include using namespace mlpack; using namespace det; -namespace arma +namespace details { - template - SpMat sort(const SpMat& data) + /** + * This one sorts and scand the given per-dimension extract and puts all splits + * in a vector, that can easily be iterated afterwards. + */ + template + std::vector> ExtractSplits(RowType& row, size_t minLeafSize) { - typedef std::vector ElemVectorType; + typedef std::pair SplitItem; + std::vector splitVec; - // Construct the vector of values / locations... - ElemVectorType indexVec(data.begin(), data.end()); - - // ... and sort it! - std::sort(indexVec.begin(), indexVec.end()); - - // Now prepare the structures for the batch construction of the - // sorted sparse row. - arma::umat locations(2, data.n_nonzero); - arma::Col vals(data.n_nonzero); - ElemType lastVal = -std::numeric_limits::max(); - size_t padding = 0; + // We want to do that inplace for optimization purposes; + std::sort(row.begin(), row.end()); - for (size_t ii = 0; ii < indexVec.size(); ++ii) + // Ensure the minimum leaf size on both sides. We need to figure out why + // there are spikes if this minLeafSize is enforced here... + for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i) { - const ElemType newVal = indexVec[ii]; - if (lastVal < ElemType(0) && newVal > ElemType(0)) - { - assert(padding == 0); // we should arrive here once! - padding = data.n_elem - data.n_nonzero; - } + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. + const ElemType split = (row[i] + row[i + 1]) / 2.0; - locations.at(0, ii) = (ii + padding) % data.n_rows; - locations.at(1, ii) = (ii + padding) / data.n_rows; - vals.at(ii) = lastVal = newVal; + // Check if we can split here (two points are different) + if (split != row[i]) + splitVec.push_back(SplitItem(split, i)); } - return SpMat(locations, vals, data.n_rows, data.n_cols, false, false); - }; + return splitVec; + + } -}; - -namespace detail -{ + // This the custom, sparse optimized implementation of the same routine. template - class DTreeSplit + std::vector> ExtractSplits(arma::SpRow& row, size_t minLeafSize) { - public: - typedef DTreeSplit SplitInfo; - - template - static bool AssignToLeftNode(const VecType& point, - const SplitInfo& splitInfo) - { - return point[splitInfo.splitDimension] < splitInfo.splitVal; - } + typedef std::pair SplitItem; + std::vector splitVec; - private: - ElemType splitVal; - size_t splitDimension; - }; - - /** - * Get the values for the dimension and sort them. The old implementation: - * dimVec = data.row(dim).subvec(start, end - 1); - * dimVec = arma::sort(dimVec); - * was quite inefficient, due to many (3) vector copy operations. This could be a - * problem especially for sparse matrices. That's why they have custom implementation. - */ - template - typename MatType::row_type ExtractSortedRow(const MatType& data, - size_t dim, - size_t start, - size_t end) - { - typedef typename MatType::elem_type ElemType; + // Construct a vector of values. + std::vector valsVec(row.begin(), row.end()); - assert(start < end); + // ... and sort it! + std::sort(valsVec.begin(), valsVec.end()); - typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); - std::sort(dimVec.begin(), dimVec.end()); - return dimVec; - } - - /** - * We need a custom implementation for sparse matrix, in order to save sorting of - * all these zeroes. - */ - template - arma::SpRow ExtractSortedRow(const arma::SpMat& data, - size_t dim, - size_t start, - size_t end) - { - assert(start < end); + // Now iterate over the values, taking account for the over-the-zeroes + // jump and construct the splits vector. + ElemType lastVal = -std::numeric_limits::max(); + size_t padding = 0, zeroes = row.n_elem - row.n_nonzero; + + for (size_t i = 0; i < valsVec.size(); ++i) + { + const ElemType newVal = valsVec[i]; + if (lastVal < ElemType(0) && newVal > ElemType(0) && zeroes > 0) + { + Log::Assert(padding == 0); // we should arrive here once! + if (lastVal >= valsVec[0] && // i.e. we're not in the beginning + i >= minLeafSize && + i <= row.n_elem - minLeafSize) + splitVec.push_back(SplitItem(lastVal / 2.0, i - 1)); + + padding = zeroes; + lastVal = ElemType(0); + } + + if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case + { + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. + const ElemType split = (lastVal + newVal) / 2.0; + + // Check if we can split here (two points are different) + if (split != newVal) + splitVec.push_back(SplitItem(split, i + padding - 1)); + } + + lastVal = newVal; + } - arma::SpRow dimVec = data(dim, arma::span(start, end - 1)); - return arma::sort(dimVec); + return splitVec; } - }; template @@ -136,8 +121,8 @@ DTree::DTree() : // Root node initializers template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints) : start(0), end(totalPoints), @@ -161,6 +146,8 @@ template DTree::DTree(MatType & data) : start(0), end(data.n_cols), + minVals(arma::min(data, 1)), + maxVals(arma::max(data, 1)), splitDim(size_t(-1)), splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), @@ -173,28 +160,13 @@ DTree::DTree(MatType & data) : left(NULL), right(NULL) { - maxVals = data.col(0); - minVals = data.col(0); - - typename MatType::row_col_iterator dataEnd = data.end_row_col(); - - // Loop over data to extract maximum and minimum values in each dimension. - for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i) - { - size_t j = i.row(); - if (*i > maxVals[j]) - maxVals[j] = *i; - else if (*i < minVals[j]) - minVals[j] = *i; - } - logNegError = LogNegativeError(data.n_cols); } // Non-root node initializers template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t start, const size_t end, const double logNegError) : @@ -217,8 +189,8 @@ DTree::DTree(const VecType& maxVals, { /* Nothing to do. */ } template -DTree::DTree(const VecType& maxVals, - const VecType& minVals, +DTree::DTree(const StatType& maxVals, + const StatType& minVals, const size_t totalPoints, const size_t start, const size_t end) : @@ -256,13 +228,12 @@ double DTree::LogNegativeError(const size_t totalPoints) const double err = 2 * std::log((double) (end - start)) - 2 * std::log((double) totalPoints); - VecType valDiffs = maxVals - minVals; - typename VecType::iterator valEnd = valDiffs.end(); - for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i) + StatType valDiffs = maxVals - minVals; + for (size_t i = 0;i < valDiffs.n_elem; ++i) { // Ignore very small dimensions to prevent overflow. - if (*i > 1e-50) - err -= std::log(*i); + if (valDiffs[i] > 1e-50) + err -= std::log(valDiffs[i]); } return err; @@ -279,10 +250,12 @@ bool DTree::FindSplit(const MatType& data, double& rightError, const size_t minLeafSize) const { + typedef std::pair SplitItem; + // Ensure the dimensionality of the data is the same as the dimensionality of // the bounding rectangle. - assert(data.n_rows == maxVals.n_elem); - assert(data.n_rows == minVals.n_elem); + Log::Assert(data.n_rows == maxVals.n_elem); + Log::Assert(data.n_rows == minVals.n_elem); const size_t points = end - start; @@ -319,44 +292,24 @@ bool DTree::FindSplit(const MatType& data, // Find the log volume of all the other dimensions. double volumeWithoutDim = logVolume - std::log(max - min); - - // Get a sorted version of the dimension in interest, - // from the given samples range. This is the most expensive step. - typename MatType::row_type dimVec = detail::ExtractSortedRow(data, - dim, - start, - end); - - typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col(); - typename MatType::row_col_iterator dI = dimVec.begin_row_col(); - // Find the best split for this dimension. - for (;;) - { - const size_t position = dI.col(); - - // Ensure the minimum leaf size on both sides. We need to figure out why - // there are spikes if this minLeafSize is enforced here... - if (position >= dimVec.n_cols - minLeafSize) - break; - if (position < minLeafSize - 1) - continue; - - ElemType split = *dI; + // Get the values for splitting. The old implementation: + // dimVec = data.row(dim).subvec(start, end - 1); + // dimVec = arma::sort(dimVec); + // could be quite inefficient for sparse matrices, due to copy operations (3). + // This one has custom implementation for dense and sparse matrices. - // In case of sparse matrices and all present values being negative - // this can happen - last many elements being 0, so we need to check. - if (++dI == dimVecEnd) - break; - - // This makes sense for real continuous data. This kinda corrupts the - // data and estimation if the data is ordinal. - split += *dI; - split /= 2.0; + typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + std::vector splitVec = details::ExtractSplits(dimVec, minLeafSize); + + // Iterate on all the splits for this dimension + for (typename std::vector::iterator i = splitVec.begin(); + i != splitVec.end(); + ++i) + { + const ElemType split = i->first; + const size_t position = i->second; - if (split == *dI) - continue; // We can't split here (two points are the same). - // Another way of picking split is using this: // split = leftsplit; if ((split - min > 0.0) && (max - split > 0.0)) @@ -475,14 +428,13 @@ double DTree::Grow(MatType& data, { // Move the data around for the children to have points in a node lie // contiguously (to increase efficiency during the training). -// const size_t splitIndex = splt::PerformSplit(data, start, end - start, ) const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew); // Make max and min vals for the children. - arma::vec maxValsL(maxVals); - arma::vec maxValsR(maxVals); - arma::vec minValsL(minVals); - arma::vec minValsR(minVals); + StatType maxValsL(maxVals); + StatType maxValsR(maxVals); + StatType minValsL(minVals); + StatType minValsR(minVals); maxValsL[dim] = splitValueTmp; minValsR[dim] = splitValueTmp; @@ -524,7 +476,7 @@ double DTree::Grow(MatType& data, else { // We can make this a leaf node. - assert((size_t) (end - start) >= minLeafSize); + Log::Assert((size_t) (end - start) >= minLeafSize); subtreeLeaves = 1; subtreeLeavesLogNegError = logNegError; } From 1fee500a5dd7c12ffac48bcd2d5bc58a2ed9ce75 Mon Sep 17 00:00:00 2001 From: theJonan Date: Tue, 18 Oct 2016 11:54:50 +0300 Subject: [PATCH 18/25] - All DET sparsification works. --- src/mlpack/methods/det/dt_utils_impl.hpp | 29 +++-- src/mlpack/methods/det/dtree.hpp | 2 +- src/mlpack/methods/det/dtree_impl.hpp | 9 +- src/mlpack/tests/det_test.cpp | 137 +++++++++++++++++++++++ 4 files changed, 156 insertions(+), 21 deletions(-) diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index cad52891554..fcb6b0cbbd4 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -99,11 +99,11 @@ void mlpack::det::PrintVariableImportance(const DTree* dtree, // folds. template DTree* mlpack::det::Trainer(MatType& dataset, - const size_t folds, - const bool useVolumeReg, - const size_t maxLeafSize, - const size_t minLeafSize, - const std::string unprunedTreeOutput) + const size_t folds, + const bool useVolumeReg, + const size_t maxLeafSize, + const size_t minLeafSize, + const std::string unprunedTreeOutput) { // Initialize the tree. DTree dtree(dataset); @@ -170,7 +170,7 @@ DTree* mlpack::det::Trainer(MatType& dataset, << " " << oldAlpha << "." << std::endl; MatType cvData(dataset); - size_t testSize = dataset.n_cols / folds; + const size_t testSize = dataset.n_cols / folds; arma::vec regularizationConstants(prunedSequence.size()); regularizationConstants.fill(0.0); @@ -181,17 +181,17 @@ DTree* mlpack::det::Trainer(MatType& dataset, // implementation. #ifdef _WIN32 #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(testSize, cvData, prunedSequence, regularizationConstants) for (intmax_t fold = 0; fold < (intmax_t) folds; fold++) #else #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants, dataset) + shared(testSize, cvData, prunedSequence, regularizationConstants) for (size_t fold = 0; fold < folds; fold++) #endif { // Break up data into train and test sets. - size_t start = fold * testSize; - size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); + const size_t start = fold * testSize; + const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); MatType test = cvData.cols(start, end - 1); MatType train(cvData.n_rows, cvData.n_cols - test.n_cols); @@ -239,11 +239,10 @@ DTree* mlpack::det::Trainer(MatType& dataset, } // Update the cv regularization constant. - cvRegularizationConstants[i] += 2.0 * cvVal / (double) dataset.n_cols; + cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols; // Determine the new alpha value and prune accordingly. - double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + - prunedSequence[i + 2].first); + double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first); cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg); } @@ -256,9 +255,9 @@ DTree* mlpack::det::Trainer(MatType& dataset, } if (prunedSequence.size() > 2) - cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols; + cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols; - #pragma omp critical + #pragma omp critical (DTreeCVUpdate) regularizationConstants += cvRegularizationConstants; } Timer::Stop("cross_validation"); diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp index 70c8cc5215c..a4ec63a87dd 100644 --- a/src/mlpack/methods/det/dtree.hpp +++ b/src/mlpack/methods/det/dtree.hpp @@ -46,7 +46,7 @@ class DTree */ typedef typename MatType::elem_type ElemType; typedef typename MatType::vec_type VecType; - typedef typename arma::Col StatType; + typedef typename arma::Col StatType; /** * Create an empty density estimation tree. diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index ad54039274a..be64f5c7fda 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -146,8 +146,8 @@ template DTree::DTree(MatType & data) : start(0), end(data.n_cols), - minVals(arma::min(data, 1)), maxVals(arma::max(data, 1)), + minVals(arma::min(data, 1)), splitDim(size_t(-1)), splitValue(std::numeric_limits::max()), subtreeLeavesLogNegError(-DBL_MAX), @@ -264,11 +264,11 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 - #pragma omp parallel for default(shared) \ + #pragma omp parallel for default(none) \ shared(splitValue, splitDim, data) for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else - #pragma omp parallel for default(shared) \ + #pragma omp parallel for default(none) \ shared(splitValue, splitDim, data) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif @@ -341,10 +341,9 @@ bool DTree::FindSplit(const MatType& data, double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; -#pragma omp atomic +#pragma omp critical (DTreeFindUpdate) if ((actualMinDimError > minError) && dimSplitFound) { -#pragma omp critical DTreeFindUpdate { // Calculate actual error (in logspace) by adding terms back to our // estimate. diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 33659842f19..09b96918ca6 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -141,6 +141,70 @@ BOOST_AUTO_TEST_CASE(TestSplitData) BOOST_REQUIRE_EQUAL(oTest[3], 2); BOOST_REQUIRE_EQUAL(oTest[4], 5); } + +BOOST_AUTO_TEST_CASE(TestSparseFindSplit) +{ + arma::mat realData(4,7); + + realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr + << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr + << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr + << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr; + + arma::sp_mat testData(realData); + + DTree testDTree(testData); + + size_t obDim, trueDim; + double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit; + + trueDim = 1; + trueSplit = .5; + trueLeftError = 2 * log(3.0 / 7.0) - (log(7.0) + log(0.5) + log(8.0) + log(6.0)); + trueRightError = 2 * log(4.0 / 7.0) - (log(7.0) + log(6.5) + log(8.0) + log(6.0)); + + testDTree.logVolume = log(7.0) + log(7.0) + log(8.0) + log(6.0); + BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1)); + + BOOST_REQUIRE(trueDim == obDim); + BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10); + + BOOST_REQUIRE_CLOSE(trueLeftError, obLeftError, 1e-10); + BOOST_REQUIRE_CLOSE(trueRightError, obRightError, 1e-10); +} + +BOOST_AUTO_TEST_CASE(TestSparseSplitData) +{ + arma::mat realData(4,7); + + realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr + << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr + << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr + << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr; + + arma::sp_mat testData(realData); + + DTree testDTree(testData); + + arma::Col oTest(7); + oTest << 1 << 2 << 3 << 4 << 5 << 6 << 7; + + size_t splitDim = 1; + double trueSplitVal = .5; + + size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest); + + BOOST_REQUIRE_EQUAL(splitInd, 3); // 2 points on left side. + + BOOST_REQUIRE_EQUAL(oTest[0], 1); + BOOST_REQUIRE_EQUAL(oTest[1], 4); + BOOST_REQUIRE_EQUAL(oTest[2], 3); + BOOST_REQUIRE_EQUAL(oTest[3], 2); + BOOST_REQUIRE_EQUAL(oTest[4], 5); + BOOST_REQUIRE_EQUAL(oTest[5], 6); + BOOST_REQUIRE_EQUAL(oTest[6], 7); +} + #endif // Tests for the public functions. @@ -307,6 +371,79 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance) BOOST_REQUIRE_CLOSE((double) (rootError - (lError + rError)), imps[2], 1e-10); } +BOOST_AUTO_TEST_CASE(TestSparsePruneAndUpdate) +{ + arma::mat realData(3, 5); + + realData << 4 << 5 << 7 << 3 << 5 << arma::endr + << 5 << 0 << 1 << 7 << 1 << arma::endr + << 5 << 6 << 7 << 1 << 8 << arma::endr; + + arma::sp_mat testData(realData); + + arma::Col oTest(5); + oTest << 0 << 1 << 2 << 3 << 4; + + DTree testDTree(testData); + double alpha = testDTree.Grow(testData, oTest, false, 2, 1); + alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); + + BOOST_REQUIRE_CLOSE(alpha, numeric_limits::max(), 1e-10); + BOOST_REQUIRE(testDTree.SubtreeLeaves() == 1); + + double rootError = -log(4.0) - log(7.0) - log(7.0); + + BOOST_REQUIRE_CLOSE(testDTree.LogNegError(), rootError, 1e-10); + BOOST_REQUIRE_CLOSE(testDTree.SubtreeLeavesLogNegError(), rootError, 1e-10); + BOOST_REQUIRE(testDTree.Left() == NULL); + BOOST_REQUIRE(testDTree.Right() == NULL); +} + +BOOST_AUTO_TEST_CASE(TestSparseComputeValue) +{ + arma::mat realData(3, 5); + + Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl; + + realData << 4 << 5 << 7 << 3 << 5 << arma::endr + << 5 << 0 << 1 << 7 << 1 << arma::endr + << 5 << 6 << 7 << 1 << 8 << arma::endr; + + arma::vec _q1(3), _q2(3), _q3(3), _q4(3); + + _q1 << 4 << 2 << 2; + _q2 << 5 << 0.25 << 6; + _q3 << 5 << 3 << 7; + _q4 << 2 << 3 << 3; + + arma::sp_mat testData(realData); + arma::sp_vec q1(_q1), q2(_q2), q3(_q3), q4(_q4); + + arma::Col oTest(5); + oTest << 0 << 1 << 2 << 3 << 4; + + DTree testDTree(testData); + double alpha = testDTree.Grow(testData, oTest, false, 2, 1); + + double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5)); + double d2 = (1.0 / 5.0) / exp(log(4.0) + log(0.5) + log(2.5)); + double d3 = (2.0 / 5.0) / exp(log(4.0) + log(6.5) + log(2.5)); + + BOOST_REQUIRE_CLOSE(d1, testDTree.ComputeValue(q1), 1e-10); + BOOST_REQUIRE_CLOSE(d2, testDTree.ComputeValue(q2), 1e-10); + BOOST_REQUIRE_CLOSE(d3, testDTree.ComputeValue(q3), 1e-10); + BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10); + + alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false); + + double d = 1.0 / exp(log(4.0) + log(7.0) + log(7.0)); + + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q1), 1e-10); + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q2), 1e-10); + BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q3), 1e-10); + BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10); +} + /** * These are not yet implemented. * From 0d0e387e9f681a5a1263013eae95d6f44f4a21f8 Mon Sep 17 00:00:00 2001 From: theJonan Date: Tue, 18 Oct 2016 12:20:03 +0300 Subject: [PATCH 19/25] - OMP test line removed. --- src/mlpack/tests/det_test.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp index 09b96918ca6..4556d7d689a 100644 --- a/src/mlpack/tests/det_test.cpp +++ b/src/mlpack/tests/det_test.cpp @@ -403,8 +403,6 @@ BOOST_AUTO_TEST_CASE(TestSparseComputeValue) { arma::mat realData(3, 5); - Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl; - realData << 4 << 5 << 7 << 3 << 5 << arma::endr << 5 << 0 << 1 << 7 << 1 << arma::endr << 5 << 6 << 7 << 1 << 8 << arma::endr; From 24f6d8c57914a382f536beb2dc3a8ef097aab52d Mon Sep 17 00:00:00 2001 From: theJonan Date: Tue, 18 Oct 2016 17:29:31 +0300 Subject: [PATCH 20/25] - Fixed openmp declarations. - Copyright notes added. --- COPYRIGHT.txt | 1 + src/mlpack/core.hpp | 1 + src/mlpack/methods/det/dt_utils_impl.hpp | 4 ++-- src/mlpack/methods/det/dtree_impl.hpp | 6 ++---- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index fd38ca1b4cc..ed1925f9bcc 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -61,6 +61,7 @@ Copyright: Copyright 2016, Nilay Jain Copyright 2016, Peter Lehner Copyright 2016, Anuraj Kanodia + Copyright 2016, Ivan Georgiev License: BSD-3-clause All rights reserved. . diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp index a5bca73b2af..3e056fd6bb7 100644 --- a/src/mlpack/core.hpp +++ b/src/mlpack/core.hpp @@ -195,6 +195,7 @@ * - Nilay Jain * - Peter Lehner * - Anuraj Kanodia + * - Ivan Georgiev */ // First, include all of the prerequisites. diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index fcb6b0cbbd4..6a798c4e7e5 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -181,11 +181,11 @@ DTree* mlpack::det::Trainer(MatType& dataset, // implementation. #ifdef _WIN32 #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants) + shared(cvData, prunedSequence, regularizationConstants) for (intmax_t fold = 0; fold < (intmax_t) folds; fold++) #else #pragma omp parallel for default(none) \ - shared(testSize, cvData, prunedSequence, regularizationConstants) + shared(cvData, prunedSequence, regularizationConstants) for (size_t fold = 0; fold < folds; fold++) #endif { diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index be64f5c7fda..92f907824c2 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -264,12 +264,10 @@ bool DTree::FindSplit(const MatType& data, // Loop through each dimension. #ifdef _WIN32 - #pragma omp parallel for default(none) \ - shared(splitValue, splitDim, data) + #pragma omp parallel for default(shared) for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim) #else - #pragma omp parallel for default(none) \ - shared(splitValue, splitDim, data) + #pragma omp parallel for default(shared) for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { From 4514e4c9f9914feaced2b437c5a5b297fa2f1353 Mon Sep 17 00:00:00 2001 From: theJonan Date: Wed, 19 Oct 2016 18:29:58 +0300 Subject: [PATCH 21/25] - Sparse matrix speed-ups. --- src/mlpack/methods/det/dtree_impl.hpp | 59 +++++++++++++++++++-------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 92f907824c2..f14961e6ca1 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -22,25 +22,34 @@ namespace details * This one sorts and scand the given per-dimension extract and puts all splits * in a vector, that can easily be iterated afterwards. */ - template - std::vector> ExtractSplits(RowType& row, size_t minLeafSize) + template + std::vector> + ExtractSplits(const MatType& data, + size_t dim, + size_t start, + size_t end, + size_t minLeafSize) { + typedef typename MatType::elem_type ElemType; typedef std::pair SplitItem; - std::vector splitVec; + typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + + // We sort these, in-place (it's a copy of the data, anyways). + std::sort(dimVec.begin(), dimVec.end()); - // We want to do that inplace for optimization purposes; - std::sort(row.begin(), row.end()); + // We're going to collect results here. + std::vector splitVec; // Ensure the minimum leaf size on both sides. We need to figure out why // there are spikes if this minLeafSize is enforced here... - for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i) + for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) { // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. - const ElemType split = (row[i] + row[i + 1]) / 2.0; + const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; // Check if we can split here (two points are different) - if (split != row[i]) + if (split != dimVec[i]) splitVec.push_back(SplitItem(split, i)); } @@ -50,21 +59,34 @@ namespace details // This the custom, sparse optimized implementation of the same routine. template - std::vector> ExtractSplits(arma::SpRow& row, size_t minLeafSize) + std::vector> + ExtractSplits(const arma::SpMat& data, + size_t dim, + size_t start, + size_t end, + size_t minLeafSize) { + typedef typename arma::SpMat::const_row_iterator RowIterator; typedef std::pair SplitItem; - std::vector splitVec; + const size_t n_elem = end - start; // Construct a vector of values. - std::vector valsVec(row.begin(), row.end()); + std::vector valsVec; + valsVec.reserve(n_elem); + + for (RowIterator j(data, dim, start);j.col() < end; ++j) + valsVec.push_back(*j); // ... and sort it! std::sort(valsVec.begin(), valsVec.end()); - + + // We're going to collect our splits here. + std::vector splitVec; + // Now iterate over the values, taking account for the over-the-zeroes // jump and construct the splits vector. ElemType lastVal = -std::numeric_limits::max(); - size_t padding = 0, zeroes = row.n_elem - row.n_nonzero; + size_t padding = 0, zeroes = n_elem - valsVec.size(); for (size_t i = 0; i < valsVec.size(); ++i) { @@ -74,14 +96,14 @@ namespace details Log::Assert(padding == 0); // we should arrive here once! if (lastVal >= valsVec[0] && // i.e. we're not in the beginning i >= minLeafSize && - i <= row.n_elem - minLeafSize) + i <= n_elem - minLeafSize) splitVec.push_back(SplitItem(lastVal / 2.0, i - 1)); padding = zeroes; lastVal = ElemType(0); } - if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case + if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize)// the normal case { // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. @@ -297,8 +319,11 @@ bool DTree::FindSplit(const MatType& data, // could be quite inefficient for sparse matrices, due to copy operations (3). // This one has custom implementation for dense and sparse matrices. - typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); - std::vector splitVec = details::ExtractSplits(dimVec, minLeafSize); + std::vector splitVec = details::ExtractSplits(data, + dim, + start, + end, + minLeafSize); // Iterate on all the splits for this dimension for (typename std::vector::iterator i = splitVec.begin(); From 8d26bb6b177ebaa9d2375705243df4ed837f4f19 Mon Sep 17 00:00:00 2001 From: theJonan Date: Wed, 19 Oct 2016 22:36:06 +0300 Subject: [PATCH 22/25] - Fix of sparse iteration --- src/mlpack/methods/det/dtree_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index f14961e6ca1..4c1b9f35171 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -74,7 +74,7 @@ namespace details std::vector valsVec; valsVec.reserve(n_elem); - for (RowIterator j(data, dim, start);j.col() < end; ++j) + for (RowIterator j(data, dim, start);j.row() == dim && j.col() < end; ++j) valsVec.push_back(*j); // ... and sort it! From 50fa9312175a6f44a1f9bb211c8dc80ccb8a0492 Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 20 Oct 2016 00:36:02 +0300 Subject: [PATCH 23/25] - Rowback to faster sparse iteration. --- src/mlpack/methods/det/dtree_impl.hpp | 50 +++++++++------------------ 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 4c1b9f35171..1d2cb7671a7 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -23,12 +23,12 @@ namespace details * in a vector, that can easily be iterated afterwards. */ template - std::vector> - ExtractSplits(const MatType& data, - size_t dim, - size_t start, - size_t end, - size_t minLeafSize) + void ExtractSplits(std::vector>& splitVec, + const MatType& data, + size_t dim, + size_t start, + size_t end, + size_t minLeafSize) { typedef typename MatType::elem_type ElemType; typedef std::pair SplitItem; @@ -37,9 +37,6 @@ namespace details // We sort these, in-place (it's a copy of the data, anyways). std::sort(dimVec.begin(), dimVec.end()); - // We're going to collect results here. - std::vector splitVec; - // Ensure the minimum leaf size on both sides. We need to figure out why // there are spikes if this minLeafSize is enforced here... for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) @@ -52,37 +49,27 @@ namespace details if (split != dimVec[i]) splitVec.push_back(SplitItem(split, i)); } - - return splitVec; - } // This the custom, sparse optimized implementation of the same routine. template - std::vector> - ExtractSplits(const arma::SpMat& data, - size_t dim, - size_t start, - size_t end, - size_t minLeafSize) + void ExtractSplits(std::vector>& splitVec, + const arma::SpMat& data, + size_t dim, + size_t start, + size_t end, + size_t minLeafSize) { - typedef typename arma::SpMat::const_row_iterator RowIterator; typedef std::pair SplitItem; const size_t n_elem = end - start; // Construct a vector of values. - std::vector valsVec; - valsVec.reserve(n_elem); - - for (RowIterator j(data, dim, start);j.row() == dim && j.col() < end; ++j) - valsVec.push_back(*j); + const arma::SpRow row = data(dim, arma::span(start, end - 1)); + std::vector valsVec(row.begin(), row.end()); // ... and sort it! std::sort(valsVec.begin(), valsVec.end()); - // We're going to collect our splits here. - std::vector splitVec; - // Now iterate over the values, taking account for the over-the-zeroes // jump and construct the splits vector. ElemType lastVal = -std::numeric_limits::max(); @@ -116,8 +103,6 @@ namespace details lastVal = newVal; } - - return splitVec; } }; @@ -319,11 +304,8 @@ bool DTree::FindSplit(const MatType& data, // could be quite inefficient for sparse matrices, due to copy operations (3). // This one has custom implementation for dense and sparse matrices. - std::vector splitVec = details::ExtractSplits(data, - dim, - start, - end, - minLeafSize); + std::vector splitVec; + details::ExtractSplits(splitVec, data, dim, start, end, minLeafSize); // Iterate on all the splits for this dimension for (typename std::vector::iterator i = splitVec.begin(); From a60ae8a4022ce1154c40b4f321bb7cab28a4663e Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 20 Oct 2016 01:07:35 +0300 Subject: [PATCH 24/25] - Fixes, based on PR's comments. - The fastest sparse-matrix utilization so far. --- src/mlpack/methods/det/CMakeLists.txt | 4 ++ src/mlpack/methods/det/dt_utils_impl.hpp | 18 +++++-- src/mlpack/methods/det/dtree_impl.hpp | 62 +++++++++++++++--------- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt index 4dd3bc32adb..ced69a990b9 100644 --- a/src/mlpack/methods/det/CMakeLists.txt +++ b/src/mlpack/methods/det/CMakeLists.txt @@ -5,6 +5,10 @@ set(SOURCES # the DET class dtree.hpp dtree_impl.hpp + + # Utility files + dt_utils.hpp + dt_utils_impl.hpp ) # add directory name to sources diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp index 6a798c4e7e5..191bce14256 100644 --- a/src/mlpack/methods/det/dt_utils_impl.hpp +++ b/src/mlpack/methods/det/dt_utils_impl.hpp @@ -158,7 +158,8 @@ DTree* mlpack::det::Trainer(MatType& dataset, // Some sanity checks. It seems that on some datasets, the error does not // increase as the tree is pruned but instead stays the same---hence the // "<=" in the final assert. - Log::Assert((alpha < std::numeric_limits::max()) || (dtree.SubtreeLeaves() == 1)); + Log::Assert((alpha < std::numeric_limits::max()) + || (dtree.SubtreeLeaves() == 1)); Log::Assert(alpha > oldAlpha); Log::Assert(dtree.SubtreeLeavesLogNegError() <= treeSeq.second); } @@ -191,7 +192,8 @@ DTree* mlpack::det::Trainer(MatType& dataset, { // Break up data into train and test sets. const size_t start = fold * testSize; - const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols); + const size_t end = std::min((size_t) (fold + 1) + * testSize, (size_t) cvData.n_cols); MatType test = cvData.cols(start, end - 1); MatType train(cvData.n_rows, cvData.n_cols - test.n_cols); @@ -242,7 +244,8 @@ DTree* mlpack::det::Trainer(MatType& dataset, cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols; // Determine the new alpha value and prune accordingly. - double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first); + double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + + prunedSequence[i + 2].first); cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg); } @@ -255,7 +258,8 @@ DTree* mlpack::det::Trainer(MatType& dataset, } if (prunedSequence.size() > 2) - cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols; + cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal + / (double) cvData.n_cols; #pragma omp critical (DTreeCVUpdate) regularizationConstants += cvRegularizationConstants; @@ -293,7 +297,11 @@ DTree* mlpack::det::Trainer(MatType& dataset, // Grow the tree. oldAlpha = -DBL_MAX; - alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, minLeafSize); + alpha = dtreeOpt->Grow(newDataset, + oldFromNew, + useVolumeReg, + maxLeafSize, + minLeafSize); // Prune with optimal alpha. while ((oldAlpha < optimalAlpha) && (dtreeOpt->SubtreeLeaves() > 1)) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 1d2cb7671a7..7c868998120 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -23,7 +23,8 @@ namespace details * in a vector, that can easily be iterated afterwards. */ template - void ExtractSplits(std::vector>& splitVec, + void ExtractSplits(std::vector< + std::pair>& splitVec, const MatType& data, size_t dim, size_t start, @@ -90,7 +91,8 @@ namespace details lastVal = ElemType(0); } - if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize)// the normal case + // the normal case + if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize) { // This makes sense for real continuous data. This kinda corrupts the // data and estimation if the data is ordinal. @@ -278,8 +280,6 @@ bool DTree::FindSplit(const MatType& data, for (size_t dim = 0; dim < maxVals.n_elem; ++dim) #endif { - // Have to deal with REAL, INTEGER, NOMINAL data differently, so we have to - // think of how to do that... const ElemType min = minVals[dim]; const ElemType max = maxVals[dim]; @@ -329,8 +329,10 @@ bool DTree::FindSplit(const MatType& data, // and because the volume is only dependent on the dimension we are // splitting, we can assume V_l is just the range of the left and V_r is // just the range of the right. - double negLeftError = std::pow(position + 1, 2.0) / (split - min); - double negRightError = std::pow(points - position - 1, 2.0) / (max - split); + double negLeftError = std::pow(position + 1, 2.0) + / (split - min); + double negRightError = std::pow(points - position - 1, 2.0) + / (max - split); // If this is better, take it. if ((negLeftError + negRightError) >= minDimError) @@ -344,21 +346,23 @@ bool DTree::FindSplit(const MatType& data, } } - double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; + double actualMinDimError = std::log(minDimError) + - 2 * std::log((double) data.n_cols) + - volumeWithoutDim; #pragma omp critical (DTreeFindUpdate) if ((actualMinDimError > minError) && dimSplitFound) { - { - // Calculate actual error (in logspace) by adding terms back to our - // estimate. - minError = actualMinDimError; - splitDim = dim; - splitValue = dimSplitValue; - leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim; - splitFound = true; - } + // Calculate actual error (in logspace) by adding terms back to our + // estimate. + minError = actualMinDimError; + splitDim = dim; + splitValue = dimSplitValue; + leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) + - volumeWithoutDim; + rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) + - volumeWithoutDim; + splitFound = true; } // end if better split found in this dimension. } @@ -451,8 +455,10 @@ double DTree::Grow(MatType& data, left = new DTree(maxValsL, minValsL, start, splitIndex, leftError); right = new DTree(maxValsR, minValsR, splitIndex, end, rightError); - leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); - rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize); + leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, + minLeafSize); + rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, + minLeafSize); // Store values of R(T~) and |T~|. subtreeLeaves = left->SubtreeLeaves() + right->SubtreeLeaves(); @@ -517,12 +523,15 @@ double DTree::Grow(MatType& data, if (right->SubtreeLeaves() > 1) { - const double exponent = 2 * std::log((double) data.n_cols) + logVolume + right->AlphaUpper(); + const double exponent = 2 * std::log((double) data.n_cols) + + logVolume + + right->AlphaUpper(); tmpAlphaSum += std::exp(exponent); } - alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - logVolume; + alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) + - logVolume; double gT; if (useVolReg) @@ -689,7 +698,9 @@ double DTree::ComputeValue(const VecType& query) const else { // Return either of the two children - left or right, depending on the splitValue - return (query[splitDim] <= splitValue) ? left->ComputeValue(query) : right->ComputeValue(query); + return (query[splitDim] <= splitValue) ? + left->ComputeValue(query) : + right->ComputeValue(query); } return 0.0; @@ -725,12 +736,15 @@ TagType DTree::FindBucket(const VecType& query) const else { // Return the tag from either of the two children - left or right. - return (query[splitDim] <= splitValue) ? left->FindBucket(query) : right->FindBucket(query); + return (query[splitDim] <= splitValue) ? + left->FindBucket(query) : + right->FindBucket(query); } } template -void DTree::ComputeVariableImportance(arma::vec& importances) const +void +DTree::ComputeVariableImportance(arma::vec& importances) const { // Clear and set to right size. importances.zeros(maxVals.n_elem); From 10812ca06558e96878ef301db268e0bcd5883610 Mon Sep 17 00:00:00 2001 From: theJonan Date: Thu, 20 Oct 2016 13:28:17 +0300 Subject: [PATCH 25/25] - Template fixes for ExtractSplits. --- src/mlpack/methods/det/dtree_impl.hpp | 107 +++++++++++++++++--------- 1 file changed, 71 insertions(+), 36 deletions(-) diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp index 7c868998120..a3e1567a497 100644 --- a/src/mlpack/methods/det/dtree_impl.hpp +++ b/src/mlpack/methods/det/dtree_impl.hpp @@ -20,35 +20,66 @@ namespace details { /** * This one sorts and scand the given per-dimension extract and puts all splits - * in a vector, that can easily be iterated afterwards. + * in a vector, that can easily be iterated afterwards. General implementation. */ - template - void ExtractSplits(std::vector< - std::pair>& splitVec, + template + void ExtractSplits(std::vector>& splitVec, const MatType& data, size_t dim, - size_t start, - size_t end, - size_t minLeafSize) + const size_t start, + const size_t end, + const size_t minLeafSize) { - typedef typename MatType::elem_type ElemType; - typedef std::pair SplitItem; - typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1)); + static_assert( + std::is_same::value == true, + "The ElemType does not correspond to the matrix's element type." + ); - // We sort these, in-place (it's a copy of the data, anyways). - std::sort(dimVec.begin(), dimVec.end()); + typedef std::pair SplitItem; + const typename MatType::row_type dimVec = + arma::sort(data(dim, arma::span(start, end - 1))); // Ensure the minimum leaf size on both sides. We need to figure out why // there are spikes if this minLeafSize is enforced here... for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) { - // This makes sense for real continuous data. This kinda corrupts the - // data and estimation if the data is ordinal. + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. Potentially we can fix + // that by taking into account ordinality later in the min/max update, + // but then we can end-up with a zero-volumed dimension. No good. const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; // Check if we can split here (two points are different) if (split != dimVec[i]) - splitVec.push_back(SplitItem(split, i)); + splitVec.push_back(SplitItem(split, i + 1)); + } + } + + // Now the custom arma::Mat implementation + template + void ExtractSplits(std::vector>& splitVec, + const arma::Mat& data, + size_t dim, + const size_t start, + const size_t end, + const size_t minLeafSize) + { + typedef std::pair SplitItem; + arma::vec dimVec = data(dim, arma::span(start, end - 1)); + + // We sort these, in-place (it's a copy of the data, anyways). + std::sort(dimVec.begin(), dimVec.end()); + + for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i) + { + // This makes sense for real continuous data. This kinda corrupts the + // data and estimation if the data is ordinal. Potentially we can fix + // that by taking into account ordinality later in the min/max update, + // but then we can end-up with a zero-volumed dimension. No good. + const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0; + + if (split != dimVec[i]) + splitVec.push_back(SplitItem(split, i + 1)); } } @@ -57,10 +88,13 @@ namespace details void ExtractSplits(std::vector>& splitVec, const arma::SpMat& data, size_t dim, - size_t start, - size_t end, - size_t minLeafSize) + const size_t start, + const size_t end, + const size_t minLeafSize) { + // It's common sense, but we also use it in a check later. + Log::Assert(minLeafSize > 0); + typedef std::pair SplitItem; const size_t n_elem = end - start; @@ -73,8 +107,9 @@ namespace details // Now iterate over the values, taking account for the over-the-zeroes // jump and construct the splits vector. + const size_t zeroes = n_elem - valsVec.size(); ElemType lastVal = -std::numeric_limits::max(); - size_t padding = 0, zeroes = n_elem - valsVec.size(); + size_t padding = 0; for (size_t i = 0; i < valsVec.size(); ++i) { @@ -82,10 +117,10 @@ namespace details if (lastVal < ElemType(0) && newVal > ElemType(0) && zeroes > 0) { Log::Assert(padding == 0); // we should arrive here once! - if (lastVal >= valsVec[0] && // i.e. we're not in the beginning - i >= minLeafSize && - i <= n_elem - minLeafSize) - splitVec.push_back(SplitItem(lastVal / 2.0, i - 1)); + + // the minLeafSize > 0 also guarantees we're not entering right at the start. + if (i >= minLeafSize && i <= n_elem - minLeafSize) + splitVec.push_back(SplitItem(lastVal / 2.0, i)); padding = zeroes; lastVal = ElemType(0); @@ -95,12 +130,14 @@ namespace details if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize) { // This makes sense for real continuous data. This kinda corrupts the - // data and estimation if the data is ordinal. + // data and estimation if the data is ordinal. Potentially we can fix + // that by taking into account ordinality later in the min/max update, + // but then we can end-up with a zero-volumed dimension. No good. const ElemType split = (lastVal + newVal) / 2.0; // Check if we can split here (two points are different) if (split != newVal) - splitVec.push_back(SplitItem(split, i + padding - 1)); + splitVec.push_back(SplitItem(split, i + padding)); } lastVal = newVal; @@ -287,7 +324,10 @@ bool DTree::FindSplit(const MatType& data, if (max - min == 0.0) continue; // Skip to next dimension. - // Initializing all the stuff for this dimension. + // Find the log volume of all the other dimensions. + const double volumeWithoutDim = logVolume - std::log(max - min); + + // Initializing all other stuff for this dimension. bool dimSplitFound = false; // Take an error estimate for this dimension. double minDimError = std::pow(points, 2.0) / (max - min); @@ -295,9 +335,6 @@ bool DTree::FindSplit(const MatType& data, double dimRightError = 0.0; // always be set to something else before use. ElemType dimSplitValue = 0.0; - // Find the log volume of all the other dimensions. - double volumeWithoutDim = logVolume - std::log(max - min); - // Get the values for splitting. The old implementation: // dimVec = data.row(dim).subvec(start, end - 1); // dimVec = arma::sort(dimVec); @@ -305,7 +342,7 @@ bool DTree::FindSplit(const MatType& data, // This one has custom implementation for dense and sparse matrices. std::vector splitVec; - details::ExtractSplits(splitVec, data, dim, start, end, minLeafSize); + details::ExtractSplits(splitVec, data, dim, start, end, minLeafSize); // Iterate on all the splits for this dimension for (typename std::vector::iterator i = splitVec.begin(); @@ -321,7 +358,7 @@ bool DTree::FindSplit(const MatType& data, { // Ensure that the right node will have at least the minimum number of // points. - Log::Assert((points - position - 1) >= minLeafSize); + Log::Assert((points - position) >= minLeafSize); // Now we have to see if the error will be reduced. Simple manipulation // of the error function gives us the condition we must satisfy: @@ -329,10 +366,8 @@ bool DTree::FindSplit(const MatType& data, // and because the volume is only dependent on the dimension we are // splitting, we can assume V_l is just the range of the left and V_r is // just the range of the right. - double negLeftError = std::pow(position + 1, 2.0) - / (split - min); - double negRightError = std::pow(points - position - 1, 2.0) - / (max - split); + double negLeftError = std::pow(position, 2.0) / (split - min); + double negRightError = std::pow(points - position, 2.0) / (max - split); // If this is better, take it. if ((negLeftError + negRightError) >= minDimError) @@ -346,7 +381,7 @@ bool DTree::FindSplit(const MatType& data, } } - double actualMinDimError = std::log(minDimError) + const double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;