New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GSOC]Binarize Function + Test #666
Changes from 2 commits
2ff2fa1
095842f
d1f974a
57a1b19
57ec362
d797162
cd0a377
c271565
f7acc86
e3fc85e
55567aa
326dea0
3deff62
0977c1d
251c110
b54902e
3f70a3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
/** | ||
* @file binarize.hpp | ||
* @author Keon Kim | ||
* | ||
* Defines Binarize(), a utility function that sets values to 0 or 1 | ||
* according to a given threshold. | ||
*/ | ||
#ifndef MLPACK_CORE_DATA_BINARIZE_HPP | ||
#define MLPACK_CORE_DATA_BINARIZE_HPP | ||
|
||
#include <mlpack/core.hpp> | ||
|
||
namespace mlpack { | ||
namespace data { | ||
/** | ||
* Given an input dataset and threshold, set values greater than threshold to | ||
* 1 and values less than or equal to the threshold to 0. This overload takes | ||
* a dimension and applies the changes to the given dimension. | ||
* | ||
* @code | ||
* arma::mat input = loadData(); | ||
* double threshold = 0; | ||
* size_t dimension = 0; | ||
* | ||
* // Binarize the first dimension. All positive values in the first dimension | ||
* // will be set to 1 and the values less than or equal to 0 will become 0. | ||
* Binarize(input, threshold, dimension); | ||
* @endcode | ||
* | ||
* @param input Input matrix to Binarize. | ||
* @param threshold Threshold can be any number. | ||
* @param dimension Feature to apply the Binarize function. | ||
*/ | ||
template<typename T> | ||
void Binarize(arma::Mat<T>& input, | ||
const double threshold, | ||
const size_t dimension) | ||
{ | ||
for (size_t i = 0; i < input.n_cols; ++i) | ||
{ | ||
if (input(dimension, i) > threshold) | ||
input(dimension, i) = 1; | ||
else | ||
input(dimension, i) = 0; | ||
} | ||
} | ||
|
||
/** | ||
* Given an input dataset and threshold, set values greater than threshold to | ||
* 1 and values less than or equal to the threshold to 0. This overload applies | ||
* the changes to all dimensions. | ||
* | ||
* @code | ||
* arma::mat input = loadData(); | ||
* double threshold = 0; | ||
* | ||
* // Binarize the whole matrix. All positive values will be set to 1 and | ||
* // the values less than or equal to 0 will become 0. | ||
* Binarize(input, threshold); | ||
* @endcode | ||
* | ||
* @param input Input matrix to Binarize. | ||
* @param threshold Threshold can be any number. | ||
*/ | ||
template<typename T> | ||
void Binarize(arma::Mat<T>& input, | ||
const double threshold) | ||
{ | ||
for (size_t i = 0; i < input.n_rows; ++i) | ||
Binarize(input, threshold, i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Armadillo matrices are column-major, but this calculation accesses the matrix in a row-major way. So it would be faster to just loop over all elements in the matrix instead of calling the other overload of Binarize(). |
||
} | ||
|
||
} // namespace data | ||
} // namespace mlpack | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/** | ||
* @file binarize_test.cpp | ||
* @author Keon Kim | ||
* | ||
* Test the Binarize method. | ||
*/ | ||
#include <mlpack/core.hpp> | ||
#include <mlpack/core/data/binarize.hpp> | ||
#include <mlpack/core/math/random.hpp> | ||
|
||
#include <boost/test/unit_test.hpp> | ||
#include "old_boost_test_definitions.hpp" | ||
|
||
using namespace mlpack; | ||
using namespace arma; | ||
using namespace mlpack::data; | ||
|
||
BOOST_AUTO_TEST_SUITE(BinarizeTest); | ||
|
||
/** | ||
* Compare the binarized data with answer. | ||
* | ||
* @param input The original data set before Binarize. | ||
* @param answer The expected result to compare the input against. | ||
*/ | ||
void CheckAnswer(const mat& input, | ||
const umat& answer) | ||
{ | ||
for (size_t i = 0; i < input.n_cols; ++i) | ||
{ | ||
const mat& lhsCol = input.col(i); | ||
const umat& rhsCol = answer.col(i); | ||
for (size_t j = 0; j < lhsCol.n_rows; ++j) | ||
{ | ||
if (std::abs(rhsCol(j)) < 1e-5) | ||
BOOST_REQUIRE_SMALL(lhsCol(j), 1e-5); | ||
else | ||
BOOST_REQUIRE_CLOSE(lhsCol(j), rhsCol(j), 1e-5); | ||
} | ||
} | ||
} | ||
|
||
BOOST_AUTO_TEST_CASE(BinarizeThreshold)
{
  mat input(10, 10, fill::randu); // Fill the input with random numbers.

  // The random seed is deliberately not set here: seeding (e.g. from the
  // current time) inside a test can make specific test errors really hard to
  // reproduce.  To stress-test locally, set the seed temporarily, run the test
  // many times, then remove the seeding line again.
  const double threshold = math::Random(); // Random threshold.

  // Expected result: 1 wherever the input is strictly greater than the
  // threshold, 0 elsewhere.  Computed before `input` is modified in place.
  const umat answer = input > threshold;

  // Binarize every value inside the matrix using the random threshold.
  Binarize(input, threshold);

  CheckAnswer(input, answer);
}
|
||
/** | ||
* The same test as above, but on a larger dataset. | ||
*/ | ||
BOOST_AUTO_TEST_CASE(BinarizeThresholdLargerTest)
{
  mat input(10, 500, fill::randu); // Fill the input with random numbers.

  // No time-based reseeding of the random number generator here: a seed that
  // changes on every run makes a failing run impossible to reproduce.
  const double threshold = math::Random(); // Random threshold.

  // Expected result: 1 wherever the input is strictly greater than the
  // threshold, 0 elsewhere.  Computed before `input` is modified in place.
  const umat answer = input > threshold;

  // Binarize every value inside the matrix using the random threshold.
  Binarize(input, threshold);

  CheckAnswer(input, answer);
}
|
||
BOOST_AUTO_TEST_SUITE_END(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it's reasonable to provide an interface that enables the user to set output matrix? e.g.
void Binarize(arma::Mat<T>& input, arma::Mat<T>& output, const double threshold)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Many other functions in mlpack seem to provide this kind of interface,
so yes, I'll add
void Binarize(arma::Mat<T>& input, arma::Mat<T>& output, const double threshold)
.