diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index ea1ac68fbf5..fdc47b750f4 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -44,9 +44,9 @@ class Imputer } /** - * Given an input dataset, replace missing values with given imputation - * strategy. This overload does not produce output matrix, but overwrites the - * result into the input matrix. + * Given an input dataset, replace missing values of a dimension with the + * given imputation strategy. This function does not produce an output + * matrix, but overwrites the input matrix with the result. * * @param input Input dataset to apply imputation. * @oaran missingValue User defined missing value; it can be anything. diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index ff60a5a0ce5..2ac366450d8 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -86,11 +86,8 @@ class MissingPolicy } else { - // This string already exists in the mapping - // or not included in missingSet. - // Unlike IncrementPolicy, MissingPolicy counts all mapped values. - size_t& numMappings = maps[dimension].second; - ++numMappings; + // This string already exists in the mapping, or it is not included in + // the missingSet. 
return NaN; } } diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 2863b3e65a3..963ab6c5cf1 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -32,7 +32,9 @@ PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user " PARAM_STRING_IN_REQ("input_file", "File containing data,", "i"); PARAM_STRING_OUT("output_file", "File to save output", "o"); PARAM_STRING_IN("missing_value", "User defined missing value", "m", ""); -PARAM_STRING_IN("strategy", "imputation strategy to be applied", "s", ""); +PARAM_STRING_IN("strategy", "imputation strategy to be applied. Strategies " + "should be one of 'custom', 'mean', 'median', and 'listwise_deletion'.", + "s", ""); PARAM_DOUBLE_IN("custom_value", "user_defined custom value", "c", 0.0); PARAM_INT_IN("dimension", "the dimension to apply imputation", "d", 0); @@ -92,83 +94,98 @@ int main(int argc, char** argv) << "'custom' strategy" << endl; arma::mat input; - arma::mat output; // Policy tells how the DatasetMapper should map the values. std::set missingSet; missingSet.insert(missingValue); MissingPolicy policy(missingSet); using MapperType = DatasetMapper; DatasetMapper info(policy); - std::vector dirtyDimensions; Load(inputFile, input, info, true, true); // print how many mapping exist in each dimensions + std::vector dirtyDimensions; for (size_t i = 0; i < input.n_rows; ++i) { size_t numMappings = info.NumMappings(i); - Log::Info << numMappings << " mappings in dimension " << i << "." - << endl; if (numMappings > 0) { + Log::Info << "Replacing " << numMappings << " values in dimension " << i + << "." 
<< endl; dirtyDimensions.push_back(i); } } - // Initialize imputer class - Imputer> imputer(info); - if (strategy == "mean") - { - Imputer> imputer(info); - } - else if (strategy == "median") - { - Imputer> imputer(info); - } - else if (strategy == "listwise_deletion") + if (dirtyDimensions.size() == 0) { - Imputer> imputer(info); + Log::Warn << "The file does not contain any user-defined missing " + << "variables. The program did not perform any imputation." << endl; } - else if (strategy == "custom") + else if (CLI::HasParam("dimension") && + !(std::find(dirtyDimensions.begin(), dirtyDimensions.end(), dimension) + != dirtyDimensions.end())) { - CustomImputation strat(customValue); - Imputer> imputer(info, strat); + Log::Warn << "The given dimension of the file does not contain any " + << "user-defined missing variables. The program did not perform any " + << "imputation." << endl; } else { - Log::Fatal << "'" << strategy << "' imputation strategy does not exist" - << endl; - } - - Timer::Start("imputation"); - if (CLI::HasParam("dimension")) - { - // when --dimension is specified, - // the program will apply the changes to only the given dimension. - Log::Info << "Performing '" << strategy << "' imputation strategy " - << "to replace '" << missingValue << "' on dimension " << dimension - << "." << endl; + // Initialize imputer class + Imputer> imputer(info); + if (strategy == "mean") + { + Imputer> imputer(info); + } + else if (strategy == "median") + { + Imputer> imputer(info); + } + else if (strategy == "listwise_deletion") + { + Imputer> imputer(info); + } + else if (strategy == "custom") + { + CustomImputation strat(customValue); + Imputer> imputer(info, strat); + } + else + { + Log::Fatal << "'" << strategy << "' imputation strategy does not exist" + << endl; + } - imputer.Impute(input, missingValue, dimension); - } - else - { - // when --dimension is not specified, - // the program will apply the changes to all dimensions. 
- Log::Info << "Performing '" << strategy << "' imputation strategy " - << "to replace '" << missingValue << "' on all dimensions." << endl; + Timer::Start("imputation"); + if (CLI::HasParam("dimension")) + { + // when --dimension is specified, + // the program will apply the changes to only the given dimension. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on dimension " << dimension + << "." << endl; - for (size_t i : dirtyDimensions) + imputer.Impute(input, missingValue, dimension); + } + else { - imputer.Impute(input, missingValue, i); + // when --dimension is not specified, + // the program will apply the changes to all dimensions. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on all dimensions." << endl; + + for (size_t i : dirtyDimensions) + { + imputer.Impute(input, missingValue, i); + } } - } - Timer::Stop("imputation"); + Timer::Stop("imputation"); - if (!outputFile.empty()) - { - Log::Info << "Saving results to '" << outputFile << "'." << endl; - Save(outputFile, input, false); + if (!outputFile.empty()) + { + Log::Info << "Saving results to '" << outputFile << "'." << endl; + Save(outputFile, input, false); + } } } diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index ce48ad0bddb..3289984ebc4 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -40,19 +40,16 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) f.close(); arma::mat input; - - std::set mset; - mset.insert("a"); - MissingPolicy policy(mset); + MissingPolicy policy({"a"}); DatasetMapper info(policy); BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); - // row and column test + // row and column test. 
BOOST_REQUIRE_EQUAL(input.n_rows, 3); BOOST_REQUIRE_EQUAL(input.n_cols, 3); // Load check - // MissingPolicy should convert strings to nans + // MissingPolicy should convert strings to nans. BOOST_REQUIRE(std::isnan(input(0, 0)) == true); BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5); @@ -68,10 +65,10 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) Imputer, CustomImputation> imputer(info, customStrategy); - // convert a or nan to 99 for dimension 0 + // convert a or nan to 99 for dimension 0. imputer.Impute(input, "a", 0); - // Custom imputation result check + // Custom imputation result check. BOOST_REQUIRE_CLOSE(input(0, 0), 99.0, 1e-5); BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5);