From 400e2ebe6eada7457b4c951f6128190c2b07cdb1 Mon Sep 17 00:00:00 2001 From: Tiramisu 1993 Date: Sat, 18 Feb 2017 13:10:18 +0800 Subject: [PATCH] Implement multidimensional discrete distribution --- .../core/dists/discrete_distribution.cpp | 138 +++++++++++------- .../core/dists/discrete_distribution.hpp | 91 ++++++++---- src/mlpack/tests/distribution_test.cpp | 67 +++++++++ src/mlpack/tests/hmm_test.cpp | 14 +- src/mlpack/tests/serialization_test.cpp | 3 +- 5 files changed, 228 insertions(+), 85 deletions(-) diff --git a/src/mlpack/core/dists/discrete_distribution.cpp b/src/mlpack/core/dists/discrete_distribution.cpp index b93004e9419..66b1fa5a246 100644 --- a/src/mlpack/core/dists/discrete_distribution.cpp +++ b/src/mlpack/core/dists/discrete_distribution.cpp @@ -20,22 +20,29 @@ using namespace mlpack::distribution; */ arma::vec DiscreteDistribution::Random() const { - // Generate a random number. - double randObs = math::Random(); - arma::vec result(1); + size_t dimension = probabilities.size(); + arma::vec result(dimension); - double sumProb = 0; - for (size_t obs = 0; obs < probabilities.n_elem; obs++) + for (size_t d = 0; d < dimension; d++) { - if ((sumProb += probabilities[obs]) >= randObs) + // Generate a random number. + double randObs = math::Random(); + + double sumProb = 0; + + for (size_t obs = 0; obs < probabilities[d].n_elem; obs++) { - result[0] = obs; - return result; + if ((sumProb += probabilities[d][obs]) >= randObs) + { + result[d] = obs; + break; + } } - } - // This shouldn't happen. - result[0] = probabilities.n_elem - 1; + if (sumProb >= randObs != true) + // This shouldn't happen. + result[d] = probabilities[d].n_elem - 1; + } return result; } @@ -44,33 +51,45 @@ arma::vec DiscreteDistribution::Random() const */ void DiscreteDistribution::Train(const arma::mat& observations) { - // Clear old probabilities. - probabilities.zeros(); + // Make sure the observations have same dimension with the probabilities + if(observations.n_rows != probabilities.size()) + { + Log::Debug << "the obversation must has the same dimension with the probabilities" + << "the observation's dimension is" << observations.n_cols << "but the dimension of " + << "probabilities is" << probabilities.size() << std::endl; + } + // Get the dimension size of the distribution + const size_t dimensions = probabilities.size(); - // Add the probability of each observation. The addition of 0.5 to the - // observation is to turn the default flooring operation of the size_t cast - // into a rounding operation. - for (size_t i = 0; i < observations.n_cols; i++) + // Iterate all the probabilities in each dimension + for (size_t i=0; i < dimensions; i++) { - const size_t obs = size_t(observations(0, i) + 0.5); + // Clear the old probabilities + probabilities[i].zeros(); + for (size_t r=0; r < observations.n_cols; r++) + { + // Add the probability of each observation. The addition of 0.5 to the + // observation is to turn the default flooring operation of the size_t cast + // into a rounding observation. + const size_t obs = size_t(observations(i, r) + 0.5); - // Ensure that the observation is within the bounds. - if (obs >= probabilities.n_elem) - { - Log::Debug << "DiscreteDistribution::Train(): observation " << i - << " (" << obs << ") is invalid; observation must be in [0, " - << probabilities.n_elem << "] for this distribution." << std::endl; - } + // Ensure that the observation is within the bounds. + if (obs >= probabilities[i].n_elem) + { + Log::Debug << "DiscreteDistribution::Train(): observation " << i + << " (" << obs << ") is invalid; observation must be in [0, " + << probabilities[i].n_elem << "] for this distribution." << std::endl; + } + probabilities[i][obs]++; + } - probabilities(obs)++; + // Now normailze the distribution. + double sum = accu(probabilities[i]); + if (sum > 0) + probabilities[i] /= sum; + else + probabilities[i].fill(1.0 / probabilities[i].n_elem); // Force normalization. } - - // Now normalize the distribution. - double sum = accu(probabilities); - if (sum > 0) - probabilities /= sum; - else - probabilities.fill(1 / probabilities.n_elem); // Force normalization. } /** @@ -80,31 +99,46 @@ void DiscreteDistribution::Train(const arma::mat& observations) void DiscreteDistribution::Train(const arma::mat& observations, const arma::vec& probObs) { - // Clear old probabilities. - probabilities.zeros(); + // Make sure the observations have same dimension with the probabilities + if(observations.n_rows != probabilities.size()) + { + Log::Debug << "the obversation must has the same dimension with the probabilities" + << "the observation's dimension is" << observations.n_rows<< "but the dimension of " + << "probabilities is" << probabilities.size() << std::endl; + } - // Add the probability of each observation. The addition of 0.5 to the - // observation is to turn the default flooring operation of the size_t cast - // into a rounding observation. - for (size_t i = 0; i < observations.n_cols; i++) + // Get the dimension size of the distribution + size_t dimensions = probabilities.size(); + for (size_t i=0; i < dimensions; i++) { - const size_t obs = size_t(observations(0, i) + 0.5); + // Clear the old probabilities + probabilities[i].zeros(); // Ensure that the observation is within the bounds. - if (obs >= probabilities.n_elem) + for (size_t r=0; r < observations.n_cols; r++) { - Log::Debug << "DiscreteDistribution::Train(): observation " << i - << " (" << obs << ") is invalid; observation must be in [0, " - << probabilities.n_elem << "] for this distribution." << std::endl; + // Add the probability of each observation. The addition of 0.5 to the + // observation is to turn the default flooring operation of the size_t cast + // into a rounding observation. + + const size_t obs = size_t(observations(i, r)+ 0.5); + + // Ensure that the observation is within the bounds. + if (obs >= probabilities[i].n_elem) + { + Log::Debug << "DiscreteDistribution::Train(): observation " << i + << " (" << obs << ") is invalid; observation must be in [0, " + << probabilities[i].n_elem << "] for this distribution." << std::endl; + } + + probabilities[i][obs] += probObs[r]; } - probabilities(obs) += probObs[i]; + // Now normailze the distribution. + double sum = accu(probabilities[i]); + if (sum > 0) + probabilities[i] /= sum; + else + probabilities[i].fill(1.0 / probabilities[i].n_elem); // Force normalization. } - - // Now normalize the distribution. - double sum = accu(probabilities); - if (sum > 0) - probabilities /= sum; - else - probabilities.fill(1 / probabilities.n_elem); // Force normalization. } diff --git a/src/mlpack/core/dists/discrete_distribution.hpp b/src/mlpack/core/dists/discrete_distribution.hpp index a31ca0ca531..f3ccded79f2 100644 --- a/src/mlpack/core/dists/discrete_distribution.hpp +++ b/src/mlpack/core/dists/discrete_distribution.hpp @@ -48,7 +48,8 @@ class DiscreteDistribution /** * Default constructor, which creates a distribution that has no observations. */ - DiscreteDistribution() { /* nothing to do */ } + DiscreteDistribution() : + probabilities(std::vector(1)){ /* nothing to do */ } /** * Define the discrete distribution as having numObservations possible @@ -59,32 +60,56 @@ class DiscreteDistribution * can have. */ DiscreteDistribution(const size_t numObservations) : - probabilities(arma::ones(numObservations) / numObservations) + probabilities(std::vector(1, arma::ones(numObservations)/numObservations)) { /* nothing to do */ } /** - * Define the discrete distribution as having the given probabilities for each + * Define the multidimensional discrete distribution as having numObservations possible + * observations. The probability in each state will be set to (1 / + * numObservations of each dimension). + * + * @param numObservations Number of possible observations this distribution + * can have. + */ + DiscreteDistribution(const arma::vec& numObservations) + { + for (size_t i=0; i(numObs)/numObs); + } + } + + /** + * Define the multidimensional discrete distribution as having the given probabilities for each * observation. * * @param probabilities Probabilities of each possible observation. */ - DiscreteDistribution(const arma::vec& probabilities) + DiscreteDistribution(const std::vector& probabilities) { - // We must be sure that our distribution is normalized. - double sum = accu(probabilities); - if (sum > 0) - this->probabilities = probabilities / sum; - else + for (size_t i=0; iprobabilities.set_size(probabilities.n_elem); - this->probabilities.fill(1 / probabilities.n_elem); + arma::vec temp = probabilities[i]; + double sum = accu(temp); + if (sum > 0) + this->probabilities.push_back(temp / sum); + else + { + this->probabilities.push_back(arma::ones(temp.n_elem)/temp.n_elem); + } } } /** * Get the dimensionality of the distribution. */ - static size_t Dimensionality() { return 1; } + size_t Dimensionality() const { return probabilities.size(); } /** * Return the probability of the given observation. If the observation is @@ -96,19 +121,32 @@ class DiscreteDistribution */ double Probability(const arma::vec& observation) const { - // Adding 0.5 helps ensure that we cast the floating point to a size_t - // correctly. - const size_t obs = size_t(observation[0] + 0.5); - - // Ensure that the observation is within the bounds. - if (obs >= probabilities.n_elem) + double probability = 1.0; + // Ensure the observation has the same dimension with the probabilities + if (observation.n_elem != probabilities.size()) { - Log::Debug << "DiscreteDistribution::Probability(): received observation " - << obs << "; observation must be in [0, " << probabilities.n_elem - << "] for this distribution." << std::endl; + Log::Debug << "the obversation must has the same dimension with the probabilities" + << "the observation's dimension is" << observation.n_elem << "but the dimension of " + << "probabilities is" << probabilities.size() << std::endl; + return probability; + } + for (size_t dimension = 0; dimension < observation.n_elem; dimension++) + { + // Adding 0.5 helps ensure that we cast the floating point to a size_t + // correctly. + const size_t obs = size_t(observation(dimension) + 0.5); + + // Ensure that the observation is within the bounds. + if (obs >= probabilities[dimension].n_elem) + { + Log::Debug << "DiscreteDistribution::Probability(): received observation " + << obs << "; observation must be in [0, " << probabilities[dimension].n_elem + << "] for this distribution." << std::endl; + } + probability *= probabilities[dimension][obs]; } - return probabilities(obs); + return probability; } /** @@ -156,9 +194,10 @@ class DiscreteDistribution const arma::vec& probabilities); //! Return the vector of probabilities. - const arma::vec& Probabilities() const { return probabilities; } + arma::vec& Probabilities(const size_t dim = 0) { return probabilities[dim];} + //! Modify the vector of probabilities. - arma::vec& Probabilities() { return probabilities; } + const arma::vec& Probabilities(const size_t dim = 0) const { return probabilities[dim];} /** * Serialize the distribution. @@ -171,7 +210,9 @@ class DiscreteDistribution } private: - arma::vec probabilities; + // The Probability Martix which each column represent one dimension and + // the row represent the observations in that dimension. + std::vector probabilities; }; } // namespace distribution diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp index ce162a00b70..5146bca6a61 100644 --- a/src/mlpack/tests/distribution_test.cpp +++ b/src/mlpack/tests/distribution_test.cpp @@ -118,6 +118,73 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTrainProbTest) BOOST_REQUIRE_CLOSE(d.Probability("2"), 0.5, 1e-5); } +/** + * Achieve multidimensional probability distribution + */ +BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainProbTest) +{ + DiscreteDistribution d("10 10 10"); + + arma::mat obs("0 1 1 1 2 2 2 2 2 2;" + "0 0 0 1 1 1 2 2 2 2;" + "0 0 0 1 1 2 2 2 2 2;"); + + d.Train(obs); + BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.009, 1e-5); + BOOST_REQUIRE_CLOSE(d.Probability("0 1 2"), 0.015, 1e-5); + BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.054, 1e-5); +} + +/** + * Make sure we initialize multidimensional probability distribution + * correctly. + */ +BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionConstructorTest) +{ + DiscreteDistribution d("4 4 4 4"); + + BOOST_REQUIRE_EQUAL(d.Probabilities(0).size(), 4); + BOOST_REQUIRE_EQUAL(d.Dimensionality(), 4); + BOOST_REQUIRE_CLOSE(d.Probability("0 0 0 0"), 0.00390625, 1e-5); + BOOST_REQUIRE_CLOSE(d.Probability("0 1 2 3"), 0.00390625, 1e-5); +} + +/** + * Achieve multidimensional probability distribution + */ +BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainTest) +{ + std::vector pro{{0.1, 0.3, 0.6}, + {0.3, 0.3, 0.3}, + {0.25, 0.25, 0.5}}; + + DiscreteDistribution d(pro); + + BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.0083333, 1e-3); + BOOST_REQUIRE_CLOSE(d.Probability("0 1 2"), 0.0166666, 1e-3); + BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.05, 1e-5); +} + +/** + * Estimate multidimensional probability distribution from observations with probabilities. + */ +BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainProTest) +{ + DiscreteDistribution d("5 5 5"); + + arma::mat obs("0 0 1 1 2;" + "0 1 1 2 2;" + "0 1 1 2 2"); + + arma::vec prob("0.25 0.25 0.25 0.25 1"); + + d.Train(obs, prob); + + BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.00390625, 1e-5); + BOOST_REQUIRE_CLOSE(d.Probability("1 0 1"), 0.0078125, 1e-5); + BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.015625, 1e-5); +} + /*********************************/ /** Gaussian Distribution Tests **/ /*********************************/ diff --git a/src/mlpack/tests/hmm_test.cpp b/src/mlpack/tests/hmm_test.cpp index c9016c51bff..41175715084 100644 --- a/src/mlpack/tests/hmm_test.cpp +++ b/src/mlpack/tests/hmm_test.cpp @@ -41,8 +41,8 @@ BOOST_AUTO_TEST_CASE(SimpleDiscreteHMMTestViterbi) arma::vec initial("1 0"); // Default MATLAB initial states. arma::mat transition("0.7 0.3; 0.3 0.7"); std::vector emission(2); - emission[0] = DiscreteDistribution("0.9 0.1"); - emission[1] = DiscreteDistribution("0.2 0.8"); + emission[0] = DiscreteDistribution(std::vector{"0.9 0.1"}); + emission[1] = DiscreteDistribution(std::vector{"0.2 0.8"}); HMM hmm(initial, transition, emission); @@ -78,9 +78,9 @@ BOOST_AUTO_TEST_CASE(BorodovskyHMMTestViterbi) "0.5 0.5 0.6"); // Four emission states: A, C, G, T. Start state doesn't emit... std::vector emission(3); - emission[0] = DiscreteDistribution("0.25 0.25 0.25 0.25"); - emission[1] = DiscreteDistribution("0.20 0.30 0.30 0.20"); - emission[2] = DiscreteDistribution("0.30 0.20 0.20 0.30"); + emission[0] = DiscreteDistribution(std::vector{"0.25 0.25 0.25 0.25"}); + emission[1] = DiscreteDistribution(std::vector{"0.20 0.30 0.30 0.20"}); + emission[2] = DiscreteDistribution(std::vector{"0.30 0.20 0.20 0.30"}); HMM hmm(initial, transition, emission); @@ -118,8 +118,8 @@ BOOST_AUTO_TEST_CASE(ForwardBackwardTwoState) arma::vec initial("0.1 0.4"); arma::mat transition("0.1 0.9; 0.4 0.6"); std::vector emis(2); - emis[0] = DiscreteDistribution("0.85 0.15 0.00 0.00"); - emis[1] = DiscreteDistribution("0.00 0.00 0.50 0.50"); + emis[0] = DiscreteDistribution(std::vector{"0.85 0.15 0.00 0.00"}); + emis[1] = DiscreteDistribution(std::vector{"0.00 0.00 0.50 0.50"}); HMM hmm(initial, transition, emis); diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp index 750dc9715cc..de3ba4f9380 100644 --- a/src/mlpack/tests/serialization_test.cpp +++ b/src/mlpack/tests/serialization_test.cpp @@ -156,7 +156,8 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTest) // straightforward. vec prob; prob.randu(12); - DiscreteDistribution t(prob); + std::vector prob_vector = std::vector(1, prob); + DiscreteDistribution t(prob_vector); DiscreteDistribution xmlT, textT, binaryT;