From 87c05a55d521037605d02cfc422afd22650c7623 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 1 Jun 2016 14:35:58 +0900 Subject: [PATCH 01/40] concept work for imputer --- src/mlpack/core/data/CMakeLists.txt | 1 + src/mlpack/core/data/dataset_info.hpp | 15 +++- src/mlpack/core/data/dataset_info_impl.hpp | 31 +++++++- src/mlpack/core/data/imputer.hpp | 72 +++++++++++++++++++ src/mlpack/core/data/load_impl.hpp | 12 +++- src/mlpack/methods/preprocess/CMakeLists.txt | 2 +- .../preprocess/preprocess_imputer_main.cpp | 61 ++++++++++++++++ 7 files changed, 186 insertions(+), 8 deletions(-) create mode 100644 src/mlpack/core/data/imputer.hpp create mode 100644 src/mlpack/methods/preprocess/preprocess_imputer_main.cpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index ea87d0f13ab..8252b35be35 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -15,6 +15,7 @@ set(SOURCES save_impl.hpp serialization_shim.hpp split_data.hpp + imputer.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 29c7cee8be4..4b96a437b88 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -54,7 +54,7 @@ class DatasetInfo * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - size_t MapString(const std::string& string, const size_t dimension); + double MapString(const std::string& string, const size_t dimension); /** * Return the string that corresponds to a given value in a given dimension. @@ -66,6 +66,17 @@ class DatasetInfo */ const std::string& UnmapString(const size_t value, const size_t dimension); + + /** + * Return the value that corresponds to a given string in a given dimension. + * If the value is not a valid mapping in the given dimension, a + * std::invalid_argument is thrown. + * + * @param string Mapped string for value. 
+ * @param dimension Dimension to unmap string from. + */ + double UnmapValue(const std::string& string, const size_t dimension); + //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; //! Modify the type of a given dimension (be careful!). @@ -101,7 +112,7 @@ class DatasetInfo //! Mappings from strings to integers. Map entries will only exist for //! dimensions that are categorical. - std::unordered_map, + std::unordered_map, size_t>> maps; }; diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index a3ee24dc576..e3c5487033d 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -9,6 +9,7 @@ // In case it hasn't already been included. #include "dataset_info.hpp" +#include namespace mlpack { namespace data { @@ -21,20 +22,27 @@ inline DatasetInfo::DatasetInfo(const size_t dimensionality) : } // Map the string to a numeric id. -inline size_t DatasetInfo::MapString(const std::string& string, +inline double DatasetInfo::MapString(const std::string& string, const size_t dimension) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. - if (maps.count(dimension) == 0 || + if (string == "") + { + typedef boost::bimap::value_type PairType; + double nan = std::nan(""); + maps[dimension].first.insert(PairType(string, nan)); + return nan; + } + else if (maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0) { // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; if (numMappings == 0) types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; } @@ -62,6 +70,23 @@ inline const std::string& DatasetInfo::UnmapString( return maps[dimension].first.right.at(value); } +// Return the value corresponding to a string in a given dimension. +inline double DatasetInfo::UnmapValue( + const std::string& string, + const size_t dimension) +{ + // Throw an exception if the value doesn't exist. + if (maps[dimension].first.left.count(string) == 0) + { + std::ostringstream oss; + oss << "DatasetInfo::UnmapValue(): string '" << string << "' unknown for " + << "dimension " << dimension; + throw std::invalid_argument(oss.str()); + } + + return maps[dimension].first.left.at(string); +} + // Get the type of a particular dimension. inline Datatype DatasetInfo::Type(const size_t dimension) const { diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp new file mode 100644 index 00000000000..88d6b905cbf --- /dev/null +++ b/src/mlpack/core/data/imputer.hpp @@ -0,0 +1,72 @@ +/** + * @file imputer.hpp + * @author Keon Kim + * + * Defines Imputer(), a utility function to replace missing variables + * in a dataset. + */ +#ifndef MLPACK_CORE_DATA_IMPUTER_HPP +#define MLPACK_CORE_DATA_IMPUTER_HPP + +#include +#include + +namespace mlpack { +namespace data { + +/** + * Given an input dataset, replace missing values with . + * + * @param input Input dataset to apply imputation. + * @param info DatasetInfo object that holds informations about the dataset. + * @param string User-defined missing value + * @param dimension. 
+ */ + +template +void Imputer(arma::Mat& input, + DatasetInfo& info, + const std::string& missingValue, + const size_t dimension, + const std::string& strategy) +{ + Log::Info << "impute using " << strategy << " strategy" << std::endl; + + double mappedValue = info.UnmapValue(missingValue, dimension); + arma::mat stats; + + if (strategy == "mean") + { + stats = arma::mean(input); // mean of columns + } + else if (strategy == "median") + { + stats = arma::median(input); + } + + for (size_t i = 0; i < input.n_cols; ++i) + { + if (std::isnan(input(dimension, i)) || + input(dimension, i) == mappedValue) + { + // just for demo, + Log::Info << "demodemo" << std::endl; + input(dimension, i) = 9999; //stats(0, i); + } + } +} + +template +void Imputer(arma::Mat& input, + DatasetInfo& info, + const std::string& missingValue, + const size_t dimension) +{ + std::string strategy = "mean"; // default strategy + Imputer(input, info, missingValue, dimension, strategy); +} + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 947b3600e97..fb6bc901291 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -432,8 +432,16 @@ bool Load(const std::string& filename, std::stringstream sstr; arma::arma_ostream::print_elem(sstr, matrix.at(i, col), false); - eT newVal = info.MapString(sstr.str(), col); - matrix.at(i, col) = newVal; + if(sstr.str() == "") + { + eT newVal = info.MapString(sstr.str(), col); + matrix.at(i, col) = nan(""); + } + else + { + eT newVal = info.MapString(sstr.str(), col); + matrix.at(i, col) = newVal; + } } } else diff --git a/src/mlpack/methods/preprocess/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt index 3a2f7bf5c9d..14903410ccb 100644 --- a/src/mlpack/methods/preprocess/CMakeLists.txt +++ b/src/mlpack/methods/preprocess/CMakeLists.txt @@ -15,4 +15,4 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) 
#add_cli_executable(preprocess_stats) add_cli_executable(preprocess_split) #add_cli_executable(preprocess_scan) -#add_cli_executable(preprocess_imputer) +add_cli_executable(preprocess_imputer) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp new file mode 100644 index 00000000000..f549b26473e --- /dev/null +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -0,0 +1,61 @@ +/** + * @file preprocess_imputer_main.cpp + * @author Keon Kim + * + * a utility that provides imputation strategies fore + * missing values. + */ +#include +#include + +PROGRAM_INFO("Imputer", "This " + "utility takes an any type of data and provides " + "imputation strategies for missing data."); + +PARAM_STRING_REQ("input_file", "File containing data,", "i"); +PARAM_STRING("missing_value", "User defined missing value", "m", "") +PARAM_INT("feature", "the feature to be analyzed", "f", 0); +PARAM_STRING("output_file", "File to save output", "o", ""); + +using namespace mlpack; +using namespace arma; +using namespace std; + +int main(int argc, char** argv) +{ + // Parse command line options. + CLI::ParseCommandLine(argc, argv); + + const string inputFile = CLI::GetParam("input_file"); + const string missingValue = CLI::GetParam("missing_value"); + const string outputFile = CLI::GetParam("output_file"); + const size_t featureNumber = (size_t) CLI::GetParam("feature"); + + arma::mat data; + data::DatasetInfo info; + + data::Load(inputFile, data, info, true, false); + Log::Debug << "" << endl; + Log::Info << data << endl; + + Log::Info << "dataset info: " << endl; + for (size_t i = 0; i < data.n_rows; ++i) + { + Log::Info << info.NumMappings(i) << " mappings in dimension " + << i << "." 
<< endl; + } + + Log::Info << "Loading feature: " << featureNumber << endl; + data::Imputer(data, info, missingValue, featureNumber); + + Log::Debug << "" << endl; + Log::Info << data << endl; + + + if (!outputFile.empty()) + { + Log::Info << "Saving model to '" << outputFile << "'." << endl; + data::Save(outputFile, data, false); + } +} + From 631e59ee15b30372d245645273b19084f3e8546a Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 6 Jun 2016 22:20:20 +0900 Subject: [PATCH 02/40] do not to use NaN by default, let the user specify --- src/mlpack/core/data/dataset_info.hpp | 6 +++--- src/mlpack/core/data/dataset_info_impl.hpp | 24 +++++++--------------- src/mlpack/core/data/imputer.hpp | 9 +++----- src/mlpack/core/data/load_impl.hpp | 12 ++--------- 4 files changed, 15 insertions(+), 36 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 4b96a437b88..4bfa55c8332 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -54,7 +54,7 @@ class DatasetInfo * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - double MapString(const std::string& string, const size_t dimension); + size_t MapString(const std::string& string, const size_t dimension); /** * Return the string that corresponds to a given value in a given dimension. @@ -75,7 +75,7 @@ class DatasetInfo * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - double UnmapValue(const std::string& string, const size_t dimension); + size_t UnmapValue(const std::string& string, const size_t dimension); //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; @@ -112,7 +112,7 @@ class DatasetInfo //! Mappings from strings to integers. Map entries will only exist for //! dimensions that are categorical. 
- std::unordered_map, + std::unordered_map, size_t>> maps; }; diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index e3c5487033d..9457d0d2b29 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -9,7 +9,6 @@ // In case it hasn't already been included. #include "dataset_info.hpp" -#include namespace mlpack { namespace data { @@ -22,27 +21,20 @@ inline DatasetInfo::DatasetInfo(const size_t dimensionality) : } // Map the string to a numeric id. -inline double DatasetInfo::MapString(const std::string& string, +inline size_t DatasetInfo::MapString(const std::string& string, const size_t dimension) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. - if (string == "") - { - typedef boost::bimap::value_type PairType; - double nan = std::nan(""); - maps[dimension].first.insert(PairType(string, nan)); - return nan; - } - else if (maps.count(dimension) == 0 || + if (maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0) { // This string does not exist yet. size_t& numMappings = maps[dimension].second; if (numMappings == 0) types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; } @@ -54,9 +46,8 @@ inline double DatasetInfo::MapString(const std::string& string, } // Return the string corresponding to a value in a given dimension. -inline const std::string& DatasetInfo::UnmapString( - const size_t value, - const size_t dimension) +inline const std::string& DatasetInfo::UnmapString(const size_t value, + const size_t dimension) { // Throw an exception if the value doesn't exist. 
if (maps[dimension].first.right.count(value) == 0) @@ -71,9 +62,8 @@ inline const std::string& DatasetInfo::UnmapString( } // Return the value corresponding to a string in a given dimension. -inline double DatasetInfo::UnmapValue( - const std::string& string, - const size_t dimension) +inline size_t DatasetInfo::UnmapValue(const std::string& string, + const size_t dimension) { // Throw an exception if the value doesn't exist. if (maps[dimension].first.left.count(string) == 0) diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 88d6b905cbf..babba012041 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -9,7 +9,6 @@ #define MLPACK_CORE_DATA_IMPUTER_HPP #include -#include namespace mlpack { namespace data { @@ -32,7 +31,7 @@ void Imputer(arma::Mat& input, { Log::Info << "impute using " << strategy << " strategy" << std::endl; - double mappedValue = info.UnmapValue(missingValue, dimension); + size_t mappedValue = info.UnmapValue(missingValue, dimension); arma::mat stats; if (strategy == "mean") @@ -46,12 +45,10 @@ void Imputer(arma::Mat& input, for (size_t i = 0; i < input.n_cols; ++i) { - if (std::isnan(input(dimension, i)) || - input(dimension, i) == mappedValue) + if (input(dimension, i) == mappedValue) { // just for demo, - Log::Info << "demodemo" << std::endl; - input(dimension, i) = 9999; //stats(0, i); + input(dimension, i) = stats(0, i); } } } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index fb6bc901291..947b3600e97 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -432,16 +432,8 @@ bool Load(const std::string& filename, std::stringstream sstr; arma::arma_ostream::print_elem(sstr, matrix.at(i, col), false); - if(sstr.str() == "") - { - eT newVal = info.MapString(sstr.str(), col); - matrix.at(i, col) = nan(""); - } - else - { - eT newVal = info.MapString(sstr.str(), col); - matrix.at(i, col) = newVal; - } + 
eT newVal = info.MapString(sstr.str(), col); + matrix.at(i, col) = newVal; } } else From 6a1fb814e3a4e8e3052d9c9b8b749972f510b5b3 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 08:03:12 +0900 Subject: [PATCH 03/40] add template to datasetinfo and add imputer class --- src/mlpack/core/data/CMakeLists.txt | 1 + src/mlpack/core/data/dataset_info.hpp | 24 +++-- src/mlpack/core/data/dataset_info_impl.hpp | 61 ++++++------ src/mlpack/core/data/dataset_info_rich.hpp | 73 ++++++++++++++ .../data/impute_strategies/CMakeLists.txt | 14 +++ .../impute_strategies/custom_strategy.hpp | 0 .../data/impute_strategies/impute_mean.hpp | 40 ++++++++ .../data/impute_strategies/mode_strategy.hpp | 0 src/mlpack/core/data/imputer.hpp | 96 +++++++++++-------- .../core/data/map_policies/CMakeLists.txt | 15 +++ .../data/map_policies/default_map_policy.hpp | 60 ++++++++++++ .../data/map_policies/missing_map_policy.hpp | 60 ++++++++++++ .../preprocess/preprocess_imputer_main.cpp | 57 +++++++++-- 13 files changed, 414 insertions(+), 87 deletions(-) create mode 100644 src/mlpack/core/data/dataset_info_rich.hpp create mode 100644 src/mlpack/core/data/impute_strategies/CMakeLists.txt create mode 100644 src/mlpack/core/data/impute_strategies/custom_strategy.hpp create mode 100644 src/mlpack/core/data/impute_strategies/impute_mean.hpp create mode 100644 src/mlpack/core/data/impute_strategies/mode_strategy.hpp create mode 100644 src/mlpack/core/data/map_policies/CMakeLists.txt create mode 100644 src/mlpack/core/data/map_policies/default_map_policy.hpp create mode 100644 src/mlpack/core/data/map_policies/missing_map_policy.hpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 8252b35be35..805b8a452c1 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -16,6 +16,7 @@ set(SOURCES serialization_shim.hpp split_data.hpp imputer.hpp + dataset_info_rich.hpp ) # add directory name to sources diff --git 
a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 4bfa55c8332..71ed93073c0 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -13,6 +13,8 @@ #include #include +#include "map_policies/default_map_policy.hpp" + namespace mlpack { namespace data { @@ -35,7 +37,8 @@ enum Datatype : bool /* bool is all the precision we need for two types */ * Datatype::categorical) as well as mappings from strings to unsigned integers * and vice versa. */ -class DatasetInfo +template +class DatasetMapper { public: /** @@ -43,7 +46,7 @@ class DatasetInfo * dimensionality cannot be changed later; you will have to create a new * DatasetInfo object. */ - DatasetInfo(const size_t dimensionality = 0); + DatasetMapper(const size_t dimensionality = 0); /** * Given the string and the dimension to which it belongs, return its numeric @@ -54,7 +57,8 @@ class DatasetInfo * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - size_t MapString(const std::string& string, const size_t dimension); + typename MapPolicy::map_type_t MapString(const std::string& string, + const size_t dimension); /** * Return the string that corresponds to a given value in a given dimension. @@ -75,7 +79,8 @@ class DatasetInfo * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - size_t UnmapValue(const std::string& string, const size_t dimension); + const typename MapPolicy::map_type_t UnmapValue(const std::string& string, + const size_t dimension) const; //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; @@ -112,11 +117,18 @@ class DatasetInfo //! Mappings from strings to integers. Map entries will only exist for //! dimensions that are categorical. 
- std::unordered_map, - size_t>> maps; + typedef std::unordered_map, + size_t>> MapType; + + MapType maps; + MapPolicy policy; }; +using DatasetInfo = DatasetMapper; + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 9457d0d2b29..517e71356ff 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -2,7 +2,7 @@ * @file dataset_info_impl.hpp * @author Ryan Curtin * - * An implementation of the DatasetInfo class. + * An implementation of the DatasetMapper class. */ #ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP #define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP @@ -14,46 +14,33 @@ namespace mlpack { namespace data { // Default constructor. -inline DatasetInfo::DatasetInfo(const size_t dimensionality) : +template +inline DatasetMapper::DatasetMapper(const size_t dimensionality) : types(dimensionality, Datatype::numeric) { // Nothing to initialize. } // Map the string to a numeric id. -inline size_t DatasetInfo::MapString(const std::string& string, - const size_t dimension) +template +inline typename MapPolicy::map_type_t DatasetMapper::MapString( + const std::string& string, + const size_t dimension) { - // If this condition is true, either we have no mapping for the given string - // or we have no mappings for the given dimension at all. In either case, - // we create a mapping. - if (maps.count(dimension) == 0 || - maps[dimension].first.left.count(string) == 0) - { - // This string does not exist yet. - size_t& numMappings = maps[dimension].second; - if (numMappings == 0) - types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; - maps[dimension].first.insert(PairType(string, numMappings)); - return numMappings++; - } - else - { - // This string already exists in the mapping. 
- return maps[dimension].first.left.at(string); - } + return policy.template MapString(maps, string, dimension); } // Return the string corresponding to a value in a given dimension. -inline const std::string& DatasetInfo::UnmapString(const size_t value, - const size_t dimension) +template +inline const std::string& DatasetMapper::UnmapString( + const size_t value, + const size_t dimension) { // Throw an exception if the value doesn't exist. if (maps[dimension].first.right.count(value) == 0) { std::ostringstream oss; - oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for " + oss << "DatasetMapper::UnmapString(): value '" << value << "' unknown for " << "dimension " << dimension; throw std::invalid_argument(oss.str()); } @@ -62,15 +49,17 @@ inline const std::string& DatasetInfo::UnmapString(const size_t value, } // Return the value corresponding to a string in a given dimension. -inline size_t DatasetInfo::UnmapValue(const std::string& string, - const size_t dimension) +template +inline const typename MapPolicy::map_type_t DatasetMapper::UnmapValue( + const std::string& string, + const size_t dimension) const { // Throw an exception if the value doesn't exist. if (maps[dimension].first.left.count(string) == 0) { std::ostringstream oss; - oss << "DatasetInfo::UnmapValue(): string '" << string << "' unknown for " - << "dimension " << dimension; + oss << "DatasetMapper::UnmapValue(): string '" << string + << "' unknown for dimension " << dimension; throw std::invalid_argument(oss.str()); } @@ -78,7 +67,8 @@ inline size_t DatasetInfo::UnmapValue(const std::string& string, } // Get the type of a particular dimension. 
-inline Datatype DatasetInfo::Type(const size_t dimension) const +template +inline Datatype DatasetMapper::Type(const size_t dimension) const { if (dimension >= types.size()) { @@ -91,7 +81,8 @@ inline Datatype DatasetInfo::Type(const size_t dimension) const return types[dimension]; } -inline Datatype& DatasetInfo::Type(const size_t dimension) +template +inline Datatype& DatasetMapper::Type(const size_t dimension) { if (dimension >= types.size()) types.resize(dimension + 1, Datatype::numeric); @@ -99,12 +90,14 @@ inline Datatype& DatasetInfo::Type(const size_t dimension) return types[dimension]; } -inline size_t DatasetInfo::NumMappings(const size_t dimension) const +template +inline size_t DatasetMapper::NumMappings(const size_t dimension) const { return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; } -inline size_t DatasetInfo::Dimensionality() const +template +inline size_t DatasetMapper::Dimensionality() const { return types.size(); } diff --git a/src/mlpack/core/data/dataset_info_rich.hpp b/src/mlpack/core/data/dataset_info_rich.hpp new file mode 100644 index 00000000000..a31d1c33a6e --- /dev/null +++ b/src/mlpack/core/data/dataset_info_rich.hpp @@ -0,0 +1,73 @@ +/** + * @file dataset_info.hpp + * @author Ryan Curtin + * + * Defines the DatasetInfo class, which holds information about a dataset. This + * is useful when the dataset contains categorical non-numeric features that + * needs to be mapped to categorical numeric features. 
+ */ +#ifndef MLPACK_CORE_DATA_DATASET_INFO_RICH_HPP +#define MLPACK_CORE_DATA_DATASET_INFO_RICH_HPP + +#include +#include +#include "map_policies/default_map_policy.hpp" +#include + +namespace mlpack { +namespace data { + +template +class DatasetInfoRich +{ + public: + + DatasetInfoRich(const size_t dimensionality = 0): + types(dimensionality, Datatype::numeric) + { + // nothing to initialize + } + + typename MapPolicy::map_type_t MapString(const std::string& string, + const size_t dimension) + { + return policy.template MapString(maps, string, dimension); + } + + + // Return the value corresponding to a string in a given dimension. + typename MapPolicy::map_type_t UnmapValue(const std::string& string, + const size_t dimension) const + { + return 0; + } + + size_t NumMappings(const size_t dimension) const + { + return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; + } + + private: + + //! Types of each dimension. + std::vector types; + + //! Mappings from strings to integers. Map entries will only exist for + //! dimensions that are categorical. + typedef std::unordered_map, + size_t>> MapType; + + MapType maps; + //using PairType = + //boost::bimap::value_type; + + MapPolicy policy; +}; + +using DefaultDatasetInfo = DatasetInfoRich; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/impute_strategies/CMakeLists.txt b/src/mlpack/core/data/impute_strategies/CMakeLists.txt new file mode 100644 index 00000000000..aae4b59c696 --- /dev/null +++ b/src/mlpack/core/data/impute_strategies/CMakeLists.txt @@ -0,0 +1,14 @@ +# Define the files we need to compile +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + impute_mean.hpp +) + +# Add directory name to sources. +set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). 
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/mlpack/core/data/impute_strategies/impute_mean.hpp b/src/mlpack/core/data/impute_strategies/impute_mean.hpp new file mode 100644 index 00000000000..7cc3fd6b2e0 --- /dev/null +++ b/src/mlpack/core/data/impute_strategies/impute_mean.hpp @@ -0,0 +1,40 @@ +/** + * @file mean_strategy.hpp + * @author Keon Kim + * + * Defines the DatasetInfo class, which holds information about a dataset. This + * is useful when the dataset contains categorical non-numeric features that + * needs to be mapped to categorical numeric features. + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_MEAN_HPP +#define MLPACK_CORE_DATA_IMPUTE_MEAN_HPP + +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +class ImputeMean +{ + public: + typedef size_t impute_type_t; + + template + void Impute(const arma::Mat &input, + arma::Mat &output, + const size_t dimension, + const size_t index) + { + output(dimension, index) = 99; + cout << "IMPUTE CALLED MEAN MAP POLICY" << endl; + + } +}; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index babba012041..179b50294b4 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -14,54 +14,74 @@ namespace mlpack { namespace data { /** - * Given an input dataset, replace missing values with . + * This class implements a way to replace target values. It is dependent on the + * user defined Strategy and Mapper used to hold dataset's information. * - * @param input Input dataset to apply imputation. 
- * @param info DatasetInfo object that holds informations about the dataset. - * @param string User-defined missing value - * @param dimension. + * @tparam Option of imputation strategy. + * @tparam Mapper that is used to hold dataset information. + * @tparam primitive type of input and output's armadillo matrix. */ - -template -void Imputer(arma::Mat& input, - DatasetInfo& info, - const std::string& missingValue, - const size_t dimension, - const std::string& strategy) +template +class Imputer { - Log::Info << "impute using " << strategy << " strategy" << std::endl; - - size_t mappedValue = info.UnmapValue(missingValue, dimension); - arma::mat stats; - - if (strategy == "mean") - { - stats = arma::mean(input); // mean of columns - } - else if (strategy == "median") + private: + Strategy strat; + public: + Imputer() { - stats = arma::median(input); + // nothing to initialize } - for (size_t i = 0; i < input.n_cols; ++i) + /** + * Given an input dataset, replace missing values with given imputation + * strategy. + * + * @param input Input dataset to apply imputation. + * @param output + * @oaran targetValue + * @param mapper DatasetInfo object that holds informations about the dataset. + * @param dimension. + * @param transpose. + */ + void Impute(const arma::Mat &input, + arma::Mat &output, + const Mapper &mapper, + const std::string &targetValue, + const size_t dimension, + const bool transpose = true) { - if (input(dimension, i) == mappedValue) + auto mappedValue = mapper.UnmapValue(targetValue, dimension); + Log::Info << "<>" << std::endl; + Log::Info << "<>mapped value<>: " << mappedValue << std::endl; + if(transpose) + { + output.set_size(input.n_rows, input.n_cols); + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(dimension, i) == 0.0) + { + // users can specify the imputation strategies likes + // mean, mode, etc using the class'es template parameter: Strategy. 
+ Log::Info << "<>" << std::endl; + strat.template Impute(input, output, dimension, i); + } + } + } + else { - // just for demo, - input(dimension, i) = stats(0, i); + output.set_size(input.n_cols, input.n_rows); + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(i, dimension) == mappedValue) + { + Log::Info << "<>" << std::endl; + strat.template Impute(input, output, i, dimension); + } + } } + Log::Info << "" << std::endl; } -} - -template -void Imputer(arma::Mat& input, - DatasetInfo& info, - const std::string& missingValue, - const size_t dimension) -{ - std::string strategy = "mean"; // default strategy - Imputer(input, info, missingValue, dimension, strategy); -} +}; // class Imputer } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/map_policies/CMakeLists.txt b/src/mlpack/core/data/map_policies/CMakeLists.txt new file mode 100644 index 00000000000..c008f119d28 --- /dev/null +++ b/src/mlpack/core/data/map_policies/CMakeLists.txt @@ -0,0 +1,15 @@ +# Define the files we need to compile +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + default_map_policy.hpp + missing_map_policy.hpp +) + +# Add directory name to sources. +set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). +set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/src/mlpack/core/data/map_policies/default_map_policy.hpp b/src/mlpack/core/data/map_policies/default_map_policy.hpp new file mode 100644 index 00000000000..da670aa275c --- /dev/null +++ b/src/mlpack/core/data/map_policies/default_map_policy.hpp @@ -0,0 +1,60 @@ +/** + * @file increment_num_policy.hpp + * @author Keon Kim + * + * Default increment num policy for dataset info. 
+ */ +#ifndef MLPACK_CORE_DATA_MAP_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICY_HPP + +#include +#include +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +/** + * This class is used to map strings to incrementing unsigned integers (size_t). + * First string to be mapped will be mapped to 0, next to 1 and so on. + */ +class DefaultMapPolicy +{ + public: + typedef size_t map_type_t; + + template + map_type_t MapString(MapType& maps, + const std::string& string, + const size_t dimension) + { + Log::Info << "DEFAULT MAP POLICY MAPSTRING" << string << dimension << endl; + // If this condition is true, either we have no mapping for the given string + // or we have no mappings for the given dimension at all. In either case, + // we create a mapping. + if (maps.count(dimension) == 0 || + maps[dimension].first.left.count(string) == 0) + { + // This string does not exist yet. + size_t& numMappings = maps[dimension].second; + //if (numMappings == 0) + //types[dimension] = Datatype::categorical; + typedef boost::bimap::value_type PairType; + maps[dimension].first.insert(PairType(string, numMappings)); + return numMappings++; + } + else + { + // This string already exists in the mapping. + return maps[dimension].first.left.at(string); + } + } +}; // class DefaultMapPolicy + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/map_policies/missing_map_policy.hpp b/src/mlpack/core/data/map_policies/missing_map_policy.hpp new file mode 100644 index 00000000000..c5bc0a67834 --- /dev/null +++ b/src/mlpack/core/data/map_policies/missing_map_policy.hpp @@ -0,0 +1,60 @@ +/** + * @file missing_val_policy.hpp + * @author Keon Kim + * + * Default increment num policy for dataset info. 
+ */ +#ifndef MLPACK_CORE_DATA_MISSING_VAL_POLICY_HPP +#define MLPACK_CORE_DATA_MISSING_VAL_POLICY_HPP + +#include +#include +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +/** + * This class is used to map strings to incrementing unsigned integers (size_t). + * First string to be mapped will be mapped to 0, next to 1 and so on. + */ +class MissingValPolicy +{ + public: + typedef size_t map_type_t; + + template + map_type_t MapString(MapType& maps, + const std::string& string, + const size_t dimension) + { + Log::Info << "DEFAULT MAP POLICY MAPSTRING" << string << dimension << endl; + // If this condition is true, either we have no mapping for the given string + // or we have no mappings for the given dimension at all. In either case, + // we create a mapping. + if (maps.count(dimension) == 0 || + maps[dimension].first.left.count(string) == 0) + { + // This string does not exist yet. + size_t& numMappings = maps[dimension].second; + //if (numMappings == 0) + //types[dimension] = Datatype::categorical; + typedef boost::bimap::value_type PairType; + maps[dimension].first.insert(PairType(string, numMappings)); + return numMappings++; + } + else + { + // This string already exists in the mapping. 
+ return maps[dimension].first.left.at(string); + } + } +}; // class DefaultMapPolicy + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index f549b26473e..6752e31e60e 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -7,6 +7,9 @@ */ #include #include +#include +#include +#include PROGRAM_INFO("Imputer", "This " "utility takes an any type of data and provides " @@ -29,28 +32,64 @@ int main(int argc, char** argv) const string inputFile = CLI::GetParam("input_file"); const string missingValue = CLI::GetParam("missing_value"); const string outputFile = CLI::GetParam("output_file"); - const size_t featureNumber = (size_t) CLI::GetParam("feature"); + //const size_t featureNumber = (size_t) CLI::GetParam("feature"); arma::mat data; data::DatasetInfo info; data::Load(inputFile, data, info, true, false); - Log::Debug << "" << endl; - Log::Info << data << endl; + //Log::Debug << "" << endl; + //Log::Info << data << endl; + + //Log::Info << "dataset info: " << endl; + //for (size_t i = 0; i < data.n_rows; ++i) + //{ + //Log::Info << info.NumMappings(i) << " mappings in dimension " + //<< i << "." 
<< endl; + //} + + //Log::Info << "Loading feature: " << featureNumber << endl; + //data::Imputer(data, info, missingValue, featureNumber); + + //Log::Debug << "" << endl; + //Log::Info << data << endl; +/****************************/ + + Log::Info << "<><><><>Start<><><><>" << endl; + + arma::Mat input(data); + arma::Mat output; + //data::DefaultMapPolicy policy; + std::string missValue = "hello"; + data::DatasetInfo richinfo(input.n_rows); + size_t dimension = 0; + + Log::Info << input << endl; + + Log::Info << "hello is mapped to: "<< richinfo.MapString("hello", dimension) << endl; + Log::Info << "dude is mapped to" << richinfo.MapString("dude", dimension) << endl; - Log::Info << "dataset info: " << endl; for (size_t i = 0; i < data.n_rows; ++i) { - Log::Info << info.NumMappings(i) << " mappings in dimension " + Log::Info << richinfo.NumMappings(i) << " mappings in dimension " << i << "." << endl; } - Log::Info << "Loading feature: " << featureNumber << endl; - data::Imputer(data, info, missingValue, featureNumber); + data::Imputer< + data::ImputeMean, + data::DatasetInfo, + double> impu; + + impu.Impute(input, output, richinfo, missValue, dimension); + + Log::Info << "input::" << endl; + Log::Info << input << endl; + Log::Info << "output::" << endl; + Log::Info << output << endl; - Log::Debug << "" << endl; - Log::Info << data << endl; + Log::Info << "<><><><>END<><><><>" << endl; +/****************************/ if (!outputFile.empty()) { From b0c52242be45468d0bb2e7f6b8c0d5e174199307 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 09:20:07 +0900 Subject: [PATCH 04/40] clean datasetinfo class and rename files --- src/mlpack/core/data/CMakeLists.txt | 1 - src/mlpack/core/data/dataset_info.hpp | 6 +- src/mlpack/core/data/dataset_info_impl.hpp | 21 +++--- src/mlpack/core/data/dataset_info_rich.hpp | 73 ------------------- .../data/impute_strategies/CMakeLists.txt | 5 +- .../impute_strategies/custom_strategy.hpp | 37 ++++++++++ .../{impute_mean.hpp => 
mean_strategy.hpp} | 9 +-- .../impute_strategies/median_strategy.hpp | 38 ++++++++++ .../data/impute_strategies/mode_strategy.hpp | 37 ++++++++++ src/mlpack/core/data/imputer.hpp | 7 +- .../core/data/map_policies/CMakeLists.txt | 2 +- ...ap_policy.hpp => increment_map_policy.hpp} | 6 +- .../data/map_policies/missing_map_policy.hpp | 14 ++-- .../preprocess/preprocess_imputer_main.cpp | 9 ++- 14 files changed, 155 insertions(+), 110 deletions(-) delete mode 100644 src/mlpack/core/data/dataset_info_rich.hpp rename src/mlpack/core/data/impute_strategies/{impute_mean.hpp => mean_strategy.hpp} (62%) create mode 100644 src/mlpack/core/data/impute_strategies/median_strategy.hpp rename src/mlpack/core/data/map_policies/{default_map_policy.hpp => increment_map_policy.hpp} (93%) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 805b8a452c1..8252b35be35 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -16,7 +16,6 @@ set(SOURCES serialization_shim.hpp split_data.hpp imputer.hpp - dataset_info_rich.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 71ed93073c0..06fa6bc8c80 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -13,7 +13,7 @@ #include #include -#include "map_policies/default_map_policy.hpp" +#include "map_policies/increment_map_policy.hpp" namespace mlpack { namespace data { @@ -79,7 +79,7 @@ class DatasetMapper * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - const typename MapPolicy::map_type_t UnmapValue(const std::string& string, + typename MapPolicy::map_type_t UnmapValue(const std::string& string, const size_t dimension) const; //! Return the type of a given dimension (numeric or categorical). 
@@ -127,7 +127,7 @@ class DatasetMapper MapPolicy policy; }; -using DatasetInfo = DatasetMapper; +using DatasetInfo = DatasetMapper; } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 517e71356ff..9e94f6acbd8 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -50,20 +50,21 @@ inline const std::string& DatasetMapper::UnmapString( // Return the value corresponding to a string in a given dimension. template -inline const typename MapPolicy::map_type_t DatasetMapper::UnmapValue( +inline typename MapPolicy::map_type_t DatasetMapper::UnmapValue( const std::string& string, const size_t dimension) const { + return 0; // Throw an exception if the value doesn't exist. - if (maps[dimension].first.left.count(string) == 0) - { - std::ostringstream oss; - oss << "DatasetMapper::UnmapValue(): string '" << string - << "' unknown for dimension " << dimension; - throw std::invalid_argument(oss.str()); - } - - return maps[dimension].first.left.at(string); + //if (maps[dimension].first.left.count(string) == 0) + //{ + //std::ostringstream oss; + //oss << "DatasetMapper::UnmapValue(): string '" << string + //<< "' unknown for dimension " << dimension; + //throw std::invalid_argument(oss.str()); + //} + + //return maps[dimension].first.left.at(string); } // Get the type of a particular dimension. diff --git a/src/mlpack/core/data/dataset_info_rich.hpp b/src/mlpack/core/data/dataset_info_rich.hpp deleted file mode 100644 index a31d1c33a6e..00000000000 --- a/src/mlpack/core/data/dataset_info_rich.hpp +++ /dev/null @@ -1,73 +0,0 @@ -/** - * @file dataset_info.hpp - * @author Ryan Curtin - * - * Defines the DatasetInfo class, which holds information about a dataset. This - * is useful when the dataset contains categorical non-numeric features that - * needs to be mapped to categorical numeric features. 
- */ -#ifndef MLPACK_CORE_DATA_DATASET_INFO_RICH_HPP -#define MLPACK_CORE_DATA_DATASET_INFO_RICH_HPP - -#include -#include -#include "map_policies/default_map_policy.hpp" -#include - -namespace mlpack { -namespace data { - -template -class DatasetInfoRich -{ - public: - - DatasetInfoRich(const size_t dimensionality = 0): - types(dimensionality, Datatype::numeric) - { - // nothing to initialize - } - - typename MapPolicy::map_type_t MapString(const std::string& string, - const size_t dimension) - { - return policy.template MapString(maps, string, dimension); - } - - - // Return the value corresponding to a string in a given dimension. - typename MapPolicy::map_type_t UnmapValue(const std::string& string, - const size_t dimension) const - { - return 0; - } - - size_t NumMappings(const size_t dimension) const - { - return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; - } - - private: - - //! Types of each dimension. - std::vector types; - - //! Mappings from strings to integers. Map entries will only exist for - //! dimensions that are categorical. - typedef std::unordered_map, - size_t>> MapType; - - MapType maps; - //using PairType = - //boost::bimap::value_type; - - MapPolicy policy; -}; - -using DefaultDatasetInfo = DatasetInfoRich; - -} // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/impute_strategies/CMakeLists.txt b/src/mlpack/core/data/impute_strategies/CMakeLists.txt index aae4b59c696..ae3c9b9c66c 100644 --- a/src/mlpack/core/data/impute_strategies/CMakeLists.txt +++ b/src/mlpack/core/data/impute_strategies/CMakeLists.txt @@ -1,7 +1,10 @@ # Define the files we need to compile # Anything not in this list will not be compiled into mlpack. set(SOURCES - impute_mean.hpp + mean_strategy.hpp + mode_strategy.hpp + median_strategy.hpp + custom_strategy.hpp ) # Add directory name to sources. 
diff --git a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp index e69de29bb2d..4bfbedd69fd 100644 --- a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp @@ -0,0 +1,37 @@ +/** + * @file custom_strategy.hpp + * @author Keon Kim + * + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP + +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +class CustomStrategy +{ + public: + typedef size_t impute_type_t; + + template + void Impute(const arma::Mat &input, + arma::Mat &output, + const size_t dimension, + const size_t index) + { + output(dimension, index) = 99; + cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; + + } +}; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/impute_strategies/impute_mean.hpp b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp similarity index 62% rename from src/mlpack/core/data/impute_strategies/impute_mean.hpp rename to src/mlpack/core/data/impute_strategies/mean_strategy.hpp index 7cc3fd6b2e0..3eac943b2ef 100644 --- a/src/mlpack/core/data/impute_strategies/impute_mean.hpp +++ b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp @@ -2,12 +2,9 @@ * @file mean_strategy.hpp * @author Keon Kim * - * Defines the DatasetInfo class, which holds information about a dataset. This - * is useful when the dataset contains categorical non-numeric features that - * needs to be mapped to categorical numeric features. 
*/ -#ifndef MLPACK_CORE_DATA_IMPUTE_MEAN_HPP -#define MLPACK_CORE_DATA_IMPUTE_MEAN_HPP +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP #include @@ -17,7 +14,7 @@ using namespace std; namespace mlpack { namespace data { -class ImputeMean +class MeanStrategy { public: typedef size_t impute_type_t; diff --git a/src/mlpack/core/data/impute_strategies/median_strategy.hpp b/src/mlpack/core/data/impute_strategies/median_strategy.hpp new file mode 100644 index 00000000000..b5b183730c4 --- /dev/null +++ b/src/mlpack/core/data/impute_strategies/median_strategy.hpp @@ -0,0 +1,38 @@ +/** + * @file median_strategy.hpp + * @author Keon Kim + * + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP + + +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +class MedianStrategy +{ + public: + typedef size_t impute_type_t; + + template + void Impute(const arma::Mat &input, + arma::Mat &output, + const size_t dimension, + const size_t index) + { + output(dimension, index) = 99; + cout << "IMPUTE CALLED MEDIAN MAP POLICY" << endl; + + } +}; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp index e69de29bb2d..15934974628 100644 --- a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp @@ -0,0 +1,37 @@ +/** + * @file mode_strategy.hpp + * @author Keon Kim + * + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP + +#include + + +using namespace std; + +namespace mlpack { +namespace data { + +class ModeStrategy +{ + public: + typedef size_t impute_type_t; + + template + void Impute(const arma::Mat &input, + arma::Mat &output, + const 
size_t dimension, + const size_t index) + { + output(dimension, index) = 99; + cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; + + } +}; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 179b50294b4..d09a85fc63c 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -58,13 +58,18 @@ class Imputer output.set_size(input.n_rows, input.n_cols); for (size_t i = 0; i < input.n_rows; ++i) { - if (input(dimension, i) == 0.0) + Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; + if (input(dimension, i) == mappedValue) { // users can specify the imputation strategies likes // mean, mode, etc using the class'es template parameter: Strategy. Log::Info << "<>" << std::endl; strat.template Impute(input, output, dimension, i); } + else + { + Log::Info << "" << std::endl; + } } } else diff --git a/src/mlpack/core/data/map_policies/CMakeLists.txt b/src/mlpack/core/data/map_policies/CMakeLists.txt index c008f119d28..6cfb0ee041c 100644 --- a/src/mlpack/core/data/map_policies/CMakeLists.txt +++ b/src/mlpack/core/data/map_policies/CMakeLists.txt @@ -1,7 +1,7 @@ # Define the files we need to compile # Anything not in this list will not be compiled into mlpack. 
set(SOURCES - default_map_policy.hpp + increment_map_policy.hpp missing_map_policy.hpp ) diff --git a/src/mlpack/core/data/map_policies/default_map_policy.hpp b/src/mlpack/core/data/map_policies/increment_map_policy.hpp similarity index 93% rename from src/mlpack/core/data/map_policies/default_map_policy.hpp rename to src/mlpack/core/data/map_policies/increment_map_policy.hpp index da670aa275c..5c2708ec18f 100644 --- a/src/mlpack/core/data/map_policies/default_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_map_policy.hpp @@ -1,8 +1,8 @@ /** - * @file increment_num_policy.hpp + * @file increment_map_policy.hpp * @author Keon Kim * - * Default increment num policy for dataset info. + * Default increment map policy for dataset info. */ #ifndef MLPACK_CORE_DATA_MAP_POLICY_HPP #define MLPACK_CORE_DATA_MAP_POLICY_HPP @@ -21,7 +21,7 @@ namespace data { * This class is used to map strings to incrementing unsigned integers (size_t). * First string to be mapped will be mapped to 0, next to 1 and so on. */ -class DefaultMapPolicy +class IncrementMapPolicy { public: typedef size_t map_type_t; diff --git a/src/mlpack/core/data/map_policies/missing_map_policy.hpp b/src/mlpack/core/data/map_policies/missing_map_policy.hpp index c5bc0a67834..41a6487378b 100644 --- a/src/mlpack/core/data/map_policies/missing_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_map_policy.hpp @@ -1,11 +1,11 @@ /** - * @file missing_val_policy.hpp + * @file missing_map_policy.hpp * @author Keon Kim * - * Default increment num policy for dataset info. + * Missing map policy for dataset info. */ -#ifndef MLPACK_CORE_DATA_MISSING_VAL_POLICY_HPP -#define MLPACK_CORE_DATA_MISSING_VAL_POLICY_HPP +#ifndef MLPACK_CORE_DATA_MISSING_MAP_POLICY_HPP +#define MLPACK_CORE_DATA_MISSING_MAP_POLICY_HPP #include #include @@ -21,7 +21,7 @@ namespace data { * This class is used to map strings to incrementing unsigned integers (size_t). 
* First string to be mapped will be mapped to 0, next to 1 and so on. */ -class MissingValPolicy +class MissingMapPolicy { public: typedef size_t map_type_t; @@ -31,7 +31,7 @@ class MissingValPolicy const std::string& string, const size_t dimension) { - Log::Info << "DEFAULT MAP POLICY MAPSTRING" << string << dimension << endl; + Log::Info << "MISSING MAP POLICY MAPSTRING" << string << dimension << endl; // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. @@ -52,7 +52,7 @@ class MissingValPolicy return maps[dimension].first.left.at(string); } } -}; // class DefaultMapPolicy +}; // class IncrementMapPolicy } // namespace data } // namespace mlpack diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 6752e31e60e..bfcb14c93e7 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -8,8 +8,8 @@ #include #include #include -#include -#include +#include +#include PROGRAM_INFO("Imputer", "This " "utility takes an any type of data and provides " @@ -58,7 +58,8 @@ int main(int argc, char** argv) Log::Info << "<><><><>Start<><><><>" << endl; arma::Mat input(data); - arma::Mat output; + arma::Mat output(data); + //data::DefaultMapPolicy policy; std::string missValue = "hello"; data::DatasetInfo richinfo(input.n_rows); @@ -76,7 +77,7 @@ int main(int argc, char** argv) } data::Imputer< - data::ImputeMean, + data::MeanStrategy, data::DatasetInfo, double> impu; From de35241d475ef16bddfa4b1e475f80ef64a07f34 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 16:16:53 +0900 Subject: [PATCH 05/40] implement basic imputation strategies --- src/mlpack/core/data/dataset_info.hpp | 5 +- src/mlpack/core/data/dataset_info_impl.hpp | 8 ++- .../impute_strategies/custom_strategy.hpp | 14 +--- 
.../data/impute_strategies/mean_strategy.hpp | 25 ++++--- .../impute_strategies/median_strategy.hpp | 23 ++++--- .../data/impute_strategies/mode_strategy.hpp | 3 +- src/mlpack/core/data/imputer.hpp | 68 +++++++++++++++++-- .../map_policies/increment_map_policy.hpp | 9 ++- .../data/map_policies/missing_map_policy.hpp | 7 +- .../preprocess/preprocess_imputer_main.cpp | 57 ++++------------ 10 files changed, 125 insertions(+), 94 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 06fa6bc8c80..2d60852f681 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -36,6 +36,8 @@ enum Datatype : bool /* bool is all the precision we need for two types */ * by data::Load(), and store the type of each dimension (Datatype::numeric or * Datatype::categorical) as well as mappings from strings to unsigned integers * and vice versa. + * + * @tparam MapPolicy Mapping policy used to specify MapString(); */ template class DatasetMapper @@ -57,7 +59,7 @@ class DatasetMapper * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - typename MapPolicy::map_type_t MapString(const std::string& string, + typename MapPolicy::map_type_t MapString(const std::string& string, const size_t dimension); /** @@ -127,6 +129,7 @@ class DatasetMapper MapPolicy policy; }; +// Use typedef to provide backward compatibility using DatasetInfo = DatasetMapper; } // namespace data diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 9e94f6acbd8..33ea5ef0b68 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -21,7 +21,9 @@ inline DatasetMapper::DatasetMapper(const size_t dimensionality) : // Nothing to initialize. } -// Map the string to a numeric id. 
+ +// When we want to insert value into the map, +// we could use the policy to map the string template inline typename MapPolicy::map_type_t DatasetMapper::MapString( const std::string& string, @@ -40,8 +42,8 @@ inline const std::string& DatasetMapper::UnmapString( if (maps[dimension].first.right.count(value) == 0) { std::ostringstream oss; - oss << "DatasetMapper::UnmapString(): value '" << value << "' unknown for " - << "dimension " << dimension; + oss << "DatasetMapper::UnmapString(): value '" << value + << "' unknown for dimension " << dimension; throw std::invalid_argument(oss.str()); } diff --git a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp index 4bfbedd69fd..e12379c21fe 100644 --- a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp @@ -16,19 +16,7 @@ namespace data { class CustomStrategy { - public: - typedef size_t impute_type_t; - - template - void Impute(const arma::Mat &input, - arma::Mat &output, - const size_t dimension, - const size_t index) - { - output(dimension, index) = 99; - cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; - - } + // empty class }; } // namespace data diff --git a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp index 3eac943b2ef..b24eb236213 100644 --- a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp @@ -17,18 +17,25 @@ namespace data { class MeanStrategy { public: - typedef size_t impute_type_t; - template - void Impute(const arma::Mat &input, - arma::Mat &output, + template + void Impute(const MatType &input, + MatType &output, const size_t dimension, - const size_t index) + const size_t index, + const bool transpose = true) { - output(dimension, index) = 99; - cout << "IMPUTE CALLED MEAN MAP POLICY" << endl; - - } + if (transpose) + { 
+ MatType meanMat = arma::mean(input, 1); + output(dimension, index) = meanMat(dimension); + } + else + { + MatType meanMat = arma::mean(input, 0); + output(index, dimension) = meanMat(index); + } + } }; } // namespace data diff --git a/src/mlpack/core/data/impute_strategies/median_strategy.hpp b/src/mlpack/core/data/impute_strategies/median_strategy.hpp index b5b183730c4..d9de2aefd03 100644 --- a/src/mlpack/core/data/impute_strategies/median_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/median_strategy.hpp @@ -18,17 +18,24 @@ namespace data { class MedianStrategy { public: - typedef size_t impute_type_t; - template - void Impute(const arma::Mat &input, - arma::Mat &output, + template + void Impute(const MatType &input, + MatType &output, const size_t dimension, - const size_t index) + const size_t index, + const bool transpose = true) { - output(dimension, index) = 99; - cout << "IMPUTE CALLED MEDIAN MAP POLICY" << endl; - + if (transpose) + { + MatType medianMat = arma::median(input, 1); + output(dimension, index) = medianMat(dimension); + } + else + { + MatType medianMat = arma::median(input, 0); + output(index, dimension) = medianMat(index); + } } }; diff --git a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp index 15934974628..7f1416136e3 100644 --- a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp @@ -17,14 +17,13 @@ namespace data { class ModeStrategy { public: - typedef size_t impute_type_t; - template void Impute(const arma::Mat &input, arma::Mat &output, const size_t dimension, const size_t index) { + // considering use of arma::hist() output(dimension, index) = 99; cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index d09a85fc63c..f67dc6c9e59 100644 --- a/src/mlpack/core/data/imputer.hpp +++ 
b/src/mlpack/core/data/imputer.hpp @@ -21,7 +21,7 @@ namespace data { * @tparam Mapper that is used to hold dataset information. * @tparam primitive type of input and output's armadillo matrix. */ -template +template class Imputer { private: @@ -43,8 +43,8 @@ class Imputer * @param dimension. * @param transpose. */ - void Impute(const arma::Mat &input, - arma::Mat &output, + void Impute(const MatType &input, + MatType &output, const Mapper &mapper, const std::string &targetValue, const size_t dimension, @@ -55,7 +55,6 @@ class Imputer Log::Info << "<>mapped value<>: " << mappedValue << std::endl; if(transpose) { - output.set_size(input.n_rows, input.n_cols); for (size_t i = 0; i < input.n_rows; ++i) { Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; @@ -64,7 +63,7 @@ class Imputer // users can specify the imputation strategies likes // mean, mode, etc using the class'es template parameter: Strategy. Log::Info << "<>" << std::endl; - strat.template Impute(input, output, dimension, i); + strat.template Impute(input, output, dimension, i, transpose); } else { @@ -74,18 +73,73 @@ class Imputer } else { - output.set_size(input.n_cols, input.n_rows); for (size_t i = 0; i < input.n_cols; ++i) { + Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; if (input(i, dimension) == mappedValue) { Log::Info << "<>" << std::endl; - strat.template Impute(input, output, i, dimension); + strat.template Impute(input, output, i, dimension, transpose); + } + else { + Log::Info << "" << std::endl; } } } Log::Info << "" << std::endl; } + + /** + * This overload of Impute() lets users to define custom value that + * can be replaced with the target value. 
+ */ + template + void Impute(const arma::Mat &input, + arma::Mat &output, + const Mapper &mapper, + const std::string &targetValue, + const T &customValue, + const size_t dimension, + const bool transpose = true) + { + auto mappedValue = mapper.UnmapValue(targetValue, dimension); + Log::Info << "<>" << std::endl; + Log::Info << "<>mapped value<>: " << mappedValue << std::endl; + if(transpose) + { + for (size_t i = 0; i < input.n_rows; ++i) + { + Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; + if (input(dimension, i) == mappedValue) + { + // replace the target value to custom value + Log::Info << "<>" << std::endl; + output(dimension, i) = customValue; + } + else + { + Log::Info << "" << std::endl; + } + } + } + else + { + for (size_t i = 0; i < input.n_cols; ++i) + { + Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; + if (input(i, dimension) == mappedValue) + { + Log::Info << "<>" << std::endl; + output(i, dimension) = customValue; + } + else { + Log::Info << "" << std::endl; + } + } + } + Log::Info << "" << std::endl; + } + }; // class Imputer } // namespace data diff --git a/src/mlpack/core/data/map_policies/increment_map_policy.hpp b/src/mlpack/core/data/map_policies/increment_map_policy.hpp index 5c2708ec18f..e0ee6687f48 100644 --- a/src/mlpack/core/data/map_policies/increment_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_map_policy.hpp @@ -4,8 +4,8 @@ * * Default increment map policy for dataset info. */ -#ifndef MLPACK_CORE_DATA_MAP_POLICY_HPP -#define MLPACK_CORE_DATA_MAP_POLICY_HPP +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_MAP_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_MAP_POLICY_HPP #include #include @@ -19,7 +19,7 @@ namespace data { /** * This class is used to map strings to incrementing unsigned integers (size_t). - * First string to be mapped will be mapped to 0, next to 1 and so on. 
+ * First string to be mapped will be mapped to 0, next to 1, 2, and so on. */ class IncrementMapPolicy { @@ -31,7 +31,6 @@ class IncrementMapPolicy const std::string& string, const size_t dimension) { - Log::Info << "DEFAULT MAP POLICY MAPSTRING" << string << dimension << endl; // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. @@ -52,7 +51,7 @@ class IncrementMapPolicy return maps[dimension].first.left.at(string); } } -}; // class DefaultMapPolicy +}; // class IncrementMapPolicy } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/map_policies/missing_map_policy.hpp b/src/mlpack/core/data/map_policies/missing_map_policy.hpp index 41a6487378b..81d5be31fcb 100644 --- a/src/mlpack/core/data/map_policies/missing_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_map_policy.hpp @@ -4,8 +4,8 @@ * * Missing map policy for dataset info. */ -#ifndef MLPACK_CORE_DATA_MISSING_MAP_POLICY_HPP -#define MLPACK_CORE_DATA_MISSING_MAP_POLICY_HPP +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_MISSING_MAP_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_MISSING_MAP_POLICY_HPP #include #include @@ -18,8 +18,7 @@ namespace mlpack { namespace data { /** - * This class is used to map strings to incrementing unsigned integers (size_t). - * First string to be mapped will be mapped to 0, next to 1 and so on. + * Same as increment map policy so far. 
*/ class MissingMapPolicy { diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index bfcb14c93e7..145acce6146 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -16,9 +16,13 @@ PROGRAM_INFO("Imputer", "This " "imputation strategies for missing data."); PARAM_STRING_REQ("input_file", "File containing data,", "i"); -PARAM_STRING("missing_value", "User defined missing value", "m", "") -PARAM_INT("feature", "the feature to be analyzed", "f", 0); PARAM_STRING("output_file", "File to save output", "o", ""); +PARAM_STRING("missing_value", "User defined missing value", "m", "") +PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "") +PARAM_STRING("map_to", "custom_strategy option. map to something else", "t", "") +PARAM_STRING("impute_strategy", "imputation strategy to be applied", "s", "") +PARAM_DOUBLE("custom_value", "user_defined custom value", "c", "") +PARAM_INT("feature", "the feature to apply imputation", "f", 0); using namespace mlpack; using namespace arma; @@ -32,70 +36,39 @@ int main(int argc, char** argv) const string inputFile = CLI::GetParam("input_file"); const string missingValue = CLI::GetParam("missing_value"); const string outputFile = CLI::GetParam("output_file"); - //const size_t featureNumber = (size_t) CLI::GetParam("feature"); + const size_t feature = (size_t) CLI::GetParam("feature"); - arma::mat data; + arma::mat input; data::DatasetInfo info; - data::Load(inputFile, data, info, true, false); - //Log::Debug << "" << endl; - //Log::Info << data << endl; - - //Log::Info << "dataset info: " << endl; - //for (size_t i = 0; i < data.n_rows; ++i) - //{ - //Log::Info << info.NumMappings(i) << " mappings in dimension " - //<< i << "." 
<< endl; - //} - - //Log::Info << "Loading feature: " << featureNumber << endl; - //data::Imputer(data, info, missingValue, featureNumber); - - //Log::Debug << "" << endl; - //Log::Info << data << endl; -/****************************/ - - Log::Info << "<><><><>Start<><><><>" << endl; - - arma::Mat input(data); - arma::Mat output(data); - - //data::DefaultMapPolicy policy; - std::string missValue = "hello"; - data::DatasetInfo richinfo(input.n_rows); - size_t dimension = 0; + data::Load(inputFile, input, info, true, true); Log::Info << input << endl; - Log::Info << "hello is mapped to: "<< richinfo.MapString("hello", dimension) << endl; - Log::Info << "dude is mapped to" << richinfo.MapString("dude", dimension) << endl; - - for (size_t i = 0; i < data.n_rows; ++i) + for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << richinfo.NumMappings(i) << " mappings in dimension " + Log::Info << info.NumMappings(i) << " mappings in feature " << i << "." << endl; } + arma::Mat output(input); + data::Imputer< data::MeanStrategy, data::DatasetInfo, double> impu; - impu.Impute(input, output, richinfo, missValue, dimension); + impu.Impute(input, output, info, missingValue, feature); Log::Info << "input::" << endl; Log::Info << input << endl; Log::Info << "output::" << endl; Log::Info << output << endl; - Log::Info << "<><><><>END<><><><>" << endl; - -/****************************/ - if (!outputFile.empty()) { Log::Info << "Saving model to '" << outputFile << "'." 
<< endl; - data::Save(outputFile, data, false); + data::Save(outputFile, output, false); } } From 2d38604478e7ab84407e01138e4158090002458f Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 16:20:48 +0900 Subject: [PATCH 06/40] modify imputer_main and clean logs --- src/mlpack/core/data/imputer.hpp | 37 ++++--------------- .../map_policies/increment_map_policy.hpp | 2 +- .../data/map_policies/missing_map_policy.hpp | 1 - .../preprocess/preprocess_imputer_main.cpp | 6 +-- 4 files changed, 11 insertions(+), 35 deletions(-) diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index f67dc6c9e59..03ac6ae0f64 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -9,6 +9,7 @@ #define MLPACK_CORE_DATA_IMPUTER_HPP #include +#include namespace mlpack { namespace data { @@ -21,7 +22,7 @@ namespace data { * @tparam Mapper that is used to hold dataset information. * @tparam primitive type of input and output's armadillo matrix. */ -template +template class Imputer { private: @@ -50,43 +51,31 @@ class Imputer const size_t dimension, const bool transpose = true) { + // find mapped value inside current mapper auto mappedValue = mapper.UnmapValue(targetValue, dimension); - Log::Info << "<>" << std::endl; - Log::Info << "<>mapped value<>: " << mappedValue << std::endl; + if(transpose) { for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; if (input(dimension, i) == mappedValue) { // users can specify the imputation strategies likes // mean, mode, etc using the class'es template parameter: Strategy. 
- Log::Info << "<>" << std::endl; strat.template Impute(input, output, dimension, i, transpose); } - else - { - Log::Info << "" << std::endl; - } } } else { for (size_t i = 0; i < input.n_cols; ++i) { - Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; if (input(i, dimension) == mappedValue) { - Log::Info << "<>" << std::endl; - strat.template Impute(input, output, i, dimension, transpose); - } - else { - Log::Info << "" << std::endl; + strat.template Impute(input, output, i, dimension, transpose); } } } - Log::Info << "" << std::endl; } /** @@ -102,42 +91,30 @@ class Imputer const size_t dimension, const bool transpose = true) { + // find mapped value inside current mapper auto mappedValue = mapper.UnmapValue(targetValue, dimension); - Log::Info << "<>" << std::endl; - Log::Info << "<>mapped value<>: " << mappedValue << std::endl; + if(transpose) { for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; if (input(dimension, i) == mappedValue) { // replace the target value to custom value - Log::Info << "<>" << std::endl; output(dimension, i) = customValue; } - else - { - Log::Info << "" << std::endl; - } } } else { for (size_t i = 0; i < input.n_cols; ++i) { - Log::Info << " input=> " << input(dimension, i) << " mappedValue=> "<< mappedValue << std::endl; if (input(i, dimension) == mappedValue) { - Log::Info << "<>" << std::endl; output(i, dimension) = customValue; } - else { - Log::Info << "" << std::endl; - } } } - Log::Info << "" << std::endl; } }; // class Imputer diff --git a/src/mlpack/core/data/map_policies/increment_map_policy.hpp b/src/mlpack/core/data/map_policies/increment_map_policy.hpp index e0ee6687f48..713c3fc6731 100644 --- a/src/mlpack/core/data/map_policies/increment_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_map_policy.hpp @@ -19,7 +19,7 @@ namespace data { /** * This class is used to map 
strings to incrementing unsigned integers (size_t). - * First string to be mapped will be mapped to 0, next to 1, 2, and so on. + * First string to be mapped will be mapped to 0, next to 1 and so on. */ class IncrementMapPolicy { diff --git a/src/mlpack/core/data/map_policies/missing_map_policy.hpp b/src/mlpack/core/data/map_policies/missing_map_policy.hpp index 81d5be31fcb..f93a0916a05 100644 --- a/src/mlpack/core/data/map_policies/missing_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_map_policy.hpp @@ -30,7 +30,6 @@ class MissingMapPolicy const std::string& string, const size_t dimension) { - Log::Info << "MISSING MAP POLICY MAPSTRING" << string << dimension << endl; // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 145acce6146..bbdf06bd239 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -21,7 +21,7 @@ PARAM_STRING("missing_value", "User defined missing value", "m", "") PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "") PARAM_STRING("map_to", "custom_strategy option. 
map to something else", "t", "") PARAM_STRING("impute_strategy", "imputation strategy to be applied", "s", "") -PARAM_DOUBLE("custom_value", "user_defined custom value", "c", "") +PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0) PARAM_INT("feature", "the feature to apply imputation", "f", 0); using namespace mlpack; @@ -54,9 +54,9 @@ int main(int argc, char** argv) arma::Mat output(input); data::Imputer< - data::MeanStrategy, + arma::Mat, data::DatasetInfo, - double> impu; + data::MeanStrategy> impu; impu.Impute(input, output, info, missingValue, feature); From bb045b815855a75a2bda8e2d2c235ba6d98819d1 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 17:58:02 +0900 Subject: [PATCH 07/40] add parameter verification for imputer_main --- .../preprocess/preprocess_imputer_main.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index bbdf06bd239..cf4fbdbc7e1 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -19,7 +19,6 @@ PARAM_STRING_REQ("input_file", "File containing data,", "i"); PARAM_STRING("output_file", "File to save output", "o", ""); PARAM_STRING("missing_value", "User defined missing value", "m", "") PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "") -PARAM_STRING("map_to", "custom_strategy option. 
map to something else", "t", "") PARAM_STRING("impute_strategy", "imputation strategy to be applied", "s", "") PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0) PARAM_INT("feature", "the feature to apply imputation", "f", 0); @@ -34,10 +33,23 @@ int main(int argc, char** argv) CLI::ParseCommandLine(argc, argv); const string inputFile = CLI::GetParam("input_file"); - const string missingValue = CLI::GetParam("missing_value"); const string outputFile = CLI::GetParam("output_file"); + const string missingValue = CLI::GetParam("missing_value"); + const string mapPolicy = CLI::GetParam("map_policy"); + const string imputeStrategy = CLI::GetParam("impute_strategy"); + const double customValue = CLI::GetParam("custom_value"); const size_t feature = (size_t) CLI::GetParam("feature"); + // warn if user did not specify output_file + if (!CLI::HasParam("output_file")) + Log::Warn << "--output_file is not specified, no " + << "results from this program will be saved!" << endl; + + if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) + Log::Warm << "--custom_value is specified without --impute_strategy " + << "automatically setting --impute_strategy to CustomStrategy" + << endl; + arma::mat input; data::DatasetInfo info; From 1295f4b9e289843d6c0af28289d971c897912073 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 13 Jun 2016 21:07:23 +0900 Subject: [PATCH 08/40] add custom strategy to impute_main --- .../preprocess/preprocess_imputer_main.cpp | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index cf4fbdbc7e1..f17de1c3e6a 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -45,10 +45,16 @@ int main(int argc, char** argv) Log::Warn << "--output_file is not specified, no " << "results from this program 
will be saved!" << endl; - if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) - Log::Warm << "--custom_value is specified without --impute_strategy " - << "automatically setting --impute_strategy to CustomStrategy" + if (CLI::HasParam("custom_value") && !(imputeStrategy == "custom")) + { + Log::Warn << "--custom_value is specified without --impute_strategy, " + << "--impute_strategy is automatically set to CustomStrategy." << endl; + } + + if ((imputeStrategy == "custom") && !CLI::HasParam("custom_value")) + Log::Fatal << "--custom_value must be specified when using " + << "'custom' strategy" << endl; arma::mat input; data::DatasetInfo info; @@ -65,13 +71,27 @@ int main(int argc, char** argv) arma::Mat output(input); - data::Imputer< - arma::Mat, - data::DatasetInfo, - data::MeanStrategy> impu; - impu.Impute(input, output, info, missingValue, feature); + if (imputeStrategy == "custom") + { + data::Imputer, + data::DatasetInfo, + data::CustomStrategy> impu; + impu.template Impute(input, + output, + info, + missingValue, + customValue, + feature); + } + else + { + data::Imputer, + data::DatasetInfo, + data::MeanStrategy> impu; + impu.Impute(input, output, info, missingValue, feature); + } Log::Info << "input::" << endl; Log::Info << input << endl; Log::Info << "output::" << endl; From 5a517c25ef55de1f4814dc3605190d17f868ff82 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 14 Jun 2016 11:26:20 +0900 Subject: [PATCH 09/40] add datatype change in IncrementPolicy --- src/mlpack/core/data/dataset_info.hpp | 56 +++++++++---------- src/mlpack/core/data/dataset_info_impl.hpp | 26 ++++----- .../impute_strategies/custom_strategy.hpp | 1 + .../data/impute_strategies/mean_strategy.hpp | 20 ++++++- .../impute_strategies/median_strategy.hpp | 1 + .../data/impute_strategies/mode_strategy.hpp | 2 + src/mlpack/core/data/imputer.hpp | 6 +- .../core/data/map_policies/CMakeLists.txt | 4 +- ...nt_map_policy.hpp => increment_policy.hpp} | 24 +++++--- 
...sing_map_policy.hpp => missing_policy.hpp} | 19 +++++-- .../preprocess/preprocess_imputer_main.cpp | 48 ++++++++++++---- 11 files changed, 135 insertions(+), 72 deletions(-) rename src/mlpack/core/data/map_policies/{increment_map_policy.hpp => increment_policy.hpp} (71%) rename src/mlpack/core/data/map_policies/{missing_map_policy.hpp => missing_policy.hpp} (75%) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 2d60852f681..b2124b8bdd9 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -2,9 +2,9 @@ * @file dataset_info.hpp * @author Ryan Curtin * - * Defines the DatasetInfo class, which holds information about a dataset. This - * is useful when the dataset contains categorical non-numeric features that - * needs to be mapped to categorical numeric features. + * Defines the DatasetMapper class, which holds information about a dataset. + * This is useful when the dataset contains categorical non-numeric features + * that needs to be mapped to categorical numeric features. */ #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP #define MLPACK_CORE_DATA_DATASET_INFO_HPP @@ -13,29 +13,29 @@ #include #include -#include "map_policies/increment_map_policy.hpp" +#include "map_policies/increment_policy.hpp" namespace mlpack { namespace data { /** - * The Datatype enum specifies the types of data mlpack algorithms can use. The - * vast majority of mlpack algorithms can only use numeric data (i.e. + * The Datatype enum specifies the types of data mlpack algorithms can use. + * The vast majority of mlpack algorithms can only use numeric data (i.e. * float/double/etc.), but some algorithms can use categorical data, specified - * via this Datatype enum and the DatasetInfo class. + * via this Datatype enum and the DatasetMapper class. 
*/ -enum Datatype : bool /* bool is all the precision we need for two types */ -{ - numeric = 0, - categorical = 1 -}; +/*enum Datatype : bool [> bool is all the precision we need for two types <]*/ +/*{*/ + /*numeric = 0,*/ + /*categorical = 1*/ +/*};*/ /** * Auxiliary information for a dataset, including mappings to/from strings and - * the datatype of each dimension. DatasetInfo objects are optionally produced - * by data::Load(), and store the type of each dimension (Datatype::numeric or - * Datatype::categorical) as well as mappings from strings to unsigned integers - * and vice versa. + * the datatype of each dimension. DatasetMapper objects are optionally + * produced by data::Load(), and store the type of each dimension + * (Datatype::numeric or Datatype::categorical) as well as mappings from strings + * to unsigned integers and vice versa. * * @tparam MapPolicy Mapping policy used to specify MapString(); */ @@ -44,9 +44,9 @@ class DatasetMapper { public: /** - * Create the DatasetInfo object with the given dimensionality. Note that the - * dimensionality cannot be changed later; you will have to create a new - * DatasetInfo object. + * Create the DatasetMapper object with the given dimensionality. Note that + * the dimensionality cannot be changed later; you will have to create a new + * DatasetMapper object. */ DatasetMapper(const size_t dimensionality = 0); @@ -82,7 +82,7 @@ class DatasetMapper * @param dimension Dimension to unmap string from. */ typename MapPolicy::map_type_t UnmapValue(const std::string& string, - const size_t dimension) const; + const size_t dimension); //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; @@ -96,7 +96,7 @@ class DatasetMapper size_t NumMappings(const size_t dimension) const; /** - * Get the dimensionality of the DatasetInfo object (that is, how many + * Get the dimensionality of the DatasetMapper object (that is, how many * dimensions it has information for). 
If this object was created by a call * to mlpack::data::Load(), then the dimensionality will be the same as the * number of rows (dimensions) in the dataset. @@ -117,12 +117,12 @@ class DatasetMapper //! Types of each dimension. std::vector types; - //! Mappings from strings to integers. Map entries will only exist for - //! dimensions that are categorical. - typedef std::unordered_map, - size_t>> MapType; + // BiMapType definition + using BiMapType = boost::bimap; + + // Mappings from strings to integers. + // Map entries will only exist for dimensions that are categorical. + using MapType = std::unordered_map>; MapType maps; @@ -130,7 +130,7 @@ class DatasetMapper }; // Use typedef to provide backward compatibility -using DatasetInfo = DatasetMapper; +using DatasetInfo = DatasetMapper; } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 33ea5ef0b68..e55ab4df31c 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -29,7 +29,7 @@ inline typename MapPolicy::map_type_t DatasetMapper::MapString( const std::string& string, const size_t dimension) { - return policy.template MapString(maps, string, dimension); + return policy.template MapString(maps, types, string, dimension); } // Return the string corresponding to a value in a given dimension. @@ -54,19 +54,18 @@ inline const std::string& DatasetMapper::UnmapString( template inline typename MapPolicy::map_type_t DatasetMapper::UnmapValue( const std::string& string, - const size_t dimension) const + const size_t dimension) { - return 0; // Throw an exception if the value doesn't exist. 
- //if (maps[dimension].first.left.count(string) == 0) - //{ - //std::ostringstream oss; - //oss << "DatasetMapper::UnmapValue(): string '" << string - //<< "' unknown for dimension " << dimension; - //throw std::invalid_argument(oss.str()); - //} - - //return maps[dimension].first.left.at(string); + if (maps[dimension].first.left.count(string) == 0) + { + std::ostringstream oss; + oss << "DatasetMapper::UnmapValue(): string '" << string + << "' unknown for dimension " << dimension; + throw std::invalid_argument(oss.str()); + } + + return maps[dimension].first.left.at(string); } // Get the type of a particular dimension. @@ -94,7 +93,8 @@ inline Datatype& DatasetMapper::Type(const size_t dimension) } template -inline size_t DatasetMapper::NumMappings(const size_t dimension) const +inline +size_t DatasetMapper::NumMappings(const size_t dimension) const { return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; } diff --git a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp index e12379c21fe..a8685f3ef2d 100644 --- a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp @@ -2,6 +2,7 @@ * @file custom_strategy.hpp * @author Keon Kim * + * Definition and Implementation of the empty CustomStrategy class. */ #ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP diff --git a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp index b24eb236213..223328a7df2 100644 --- a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp @@ -2,6 +2,7 @@ * @file mean_strategy.hpp * @author Keon Kim * + * Definition and Implementation of the MeanStrategy class. 
*/ #ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP @@ -14,10 +15,23 @@ using namespace std; namespace mlpack { namespace data { +/** + * The MeanStrategy + */ class MeanStrategy { public: + /** + * Computes mean, excluding NaN or target missing variables + * + * TODO: write docs for parameters + * @param input + * @param output + * @param dimension + * @param index + * @param transpose + */ template void Impute(const MatType &input, MatType &output, @@ -27,16 +41,20 @@ class MeanStrategy { if (transpose) { + // TODO: The mean must be calculated + // without NaN or target missing variable. MatType meanMat = arma::mean(input, 1); output(dimension, index) = meanMat(dimension); } else { + // TODO: The mean must be calculated + // without NaN or target missing variable. MatType meanMat = arma::mean(input, 0); output(index, dimension) = meanMat(index); } } -}; +}; // class MeanStrategy } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/impute_strategies/median_strategy.hpp b/src/mlpack/core/data/impute_strategies/median_strategy.hpp index d9de2aefd03..226df31ee34 100644 --- a/src/mlpack/core/data/impute_strategies/median_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/median_strategy.hpp @@ -2,6 +2,7 @@ * @file median_strategy.hpp * @author Keon Kim * + * Definition and Implementation of the MedianStrategy class. */ #ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP diff --git a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp index 7f1416136e3..f78ad784c3e 100644 --- a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp +++ b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp @@ -2,6 +2,7 @@ * @file mode_strategy.hpp * @author Keon Kim * + * Definition and Implementation of the ModeStrategy class. 
*/ #ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP @@ -23,6 +24,7 @@ class ModeStrategy const size_t dimension, const size_t index) { + // TODO: implement this // considering use of arma::hist() output(dimension, index) = 99; cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 03ac6ae0f64..8248ffcaea1 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -22,7 +22,7 @@ namespace data { * @tparam Mapper that is used to hold dataset information. * @tparam primitive type of input and output's armadillo matrix. */ -template +template class Imputer { private: @@ -46,7 +46,7 @@ class Imputer */ void Impute(const MatType &input, MatType &output, - const Mapper &mapper, + Mapper &mapper, const std::string &targetValue, const size_t dimension, const bool transpose = true) @@ -85,7 +85,7 @@ class Imputer template void Impute(const arma::Mat &input, arma::Mat &output, - const Mapper &mapper, + Mapper &mapper, const std::string &targetValue, const T &customValue, const size_t dimension, diff --git a/src/mlpack/core/data/map_policies/CMakeLists.txt b/src/mlpack/core/data/map_policies/CMakeLists.txt index 6cfb0ee041c..92f5820e975 100644 --- a/src/mlpack/core/data/map_policies/CMakeLists.txt +++ b/src/mlpack/core/data/map_policies/CMakeLists.txt @@ -1,8 +1,8 @@ # Define the files we need to compile # Anything not in this list will not be compiled into mlpack. set(SOURCES - increment_map_policy.hpp - missing_map_policy.hpp + increment__policy.hpp + missing__policy.hpp ) # Add directory name to sources. 
diff --git a/src/mlpack/core/data/map_policies/increment_map_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp similarity index 71% rename from src/mlpack/core/data/map_policies/increment_map_policy.hpp rename to src/mlpack/core/data/map_policies/increment_policy.hpp index 713c3fc6731..cafe6d44b29 100644 --- a/src/mlpack/core/data/map_policies/increment_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -1,11 +1,11 @@ /** - * @file increment_map_policy.hpp + * @file increment_policy.hpp * @author Keon Kim * - * Default increment map policy for dataset info. + * Default increment maping policy for dataset info. */ -#ifndef MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_MAP_POLICY_HPP -#define MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_MAP_POLICY_HPP +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP #include #include @@ -17,17 +17,25 @@ using namespace std; namespace mlpack { namespace data { +enum Datatype : bool /* bool is all the precision we need for two types */ +{ + numeric = 0, + categorical = 1 +}; + + /** * This class is used to map strings to incrementing unsigned integers (size_t). * First string to be mapped will be mapped to 0, next to 1 and so on. */ -class IncrementMapPolicy +class IncrementPolicy { public: typedef size_t map_type_t; template map_type_t MapString(MapType& maps, + std::vector& types, const std::string& string, const size_t dimension) { @@ -39,8 +47,8 @@ class IncrementMapPolicy { // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; - //if (numMappings == 0) - //types[dimension] = Datatype::categorical; + if (numMappings == 0) + types[dimension] = Datatype::categorical; typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; @@ -51,7 +59,7 @@ class IncrementMapPolicy return maps[dimension].first.left.at(string); } } -}; // class IncrementMapPolicy +}; // class IncrementPolicy } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/map_policies/missing_map_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp similarity index 75% rename from src/mlpack/core/data/map_policies/missing_map_policy.hpp rename to src/mlpack/core/data/map_policies/missing_policy.hpp index f93a0916a05..8678a82b272 100644 --- a/src/mlpack/core/data/map_policies/missing_map_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -1,11 +1,11 @@ /** - * @file missing_map_policy.hpp + * @file missing_policy.hpp * @author Keon Kim * * Missing map policy for dataset info. */ -#ifndef MLPACK_CORE_DATA_MAP_POLICIES_MISSING_MAP_POLICY_HPP -#define MLPACK_CORE_DATA_MAP_POLICIES_MISSING_MAP_POLICY_HPP +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_MISSING_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_MISSING_POLICY_HPP #include #include @@ -17,16 +17,25 @@ using namespace std; namespace mlpack { namespace data { +// TODO: move this to somewhere else so that it can be reused. +enum Datatype : bool /* bool is all the precision we need for two types */ +{ + numeric = 0, + categorical = 1 +}; + + /** * Same as increment map policy so far. 
*/ -class MissingMapPolicy +class MissingPolicy { public: typedef size_t map_type_t; template map_type_t MapString(MapType& maps, + std::vector& types, const std::string& string, const size_t dimension) { @@ -50,7 +59,7 @@ class MissingMapPolicy return maps[dimension].first.left.at(string); } } -}; // class IncrementMapPolicy +}; // class MissingPolicy } // namespace data } // namespace mlpack diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index f17de1c3e6a..0030b9cc0bd 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include PROGRAM_INFO("Imputer", "This " @@ -26,6 +26,7 @@ PARAM_INT("feature", "the feature to apply imputation", "f", 0); using namespace mlpack; using namespace arma; using namespace std; +using namespace data; int main(int argc, char** argv) { @@ -36,47 +37,68 @@ int main(int argc, char** argv) const string outputFile = CLI::GetParam("output_file"); const string missingValue = CLI::GetParam("missing_value"); const string mapPolicy = CLI::GetParam("map_policy"); - const string imputeStrategy = CLI::GetParam("impute_strategy"); const double customValue = CLI::GetParam("custom_value"); const size_t feature = (size_t) CLI::GetParam("feature"); + string imputeStrategy = CLI::GetParam("impute_strategy"); + + // missing value should be specified + if (!CLI::HasParam("missing_value")) + Log::Fatal << "--missing_value must be specified in order to perform " + << "any imputation strategies." << endl; // warn if user did not specify output_file if (!CLI::HasParam("output_file")) Log::Warn << "--output_file is not specified, no " << "results from this program will be saved!" 
<< endl; + // if custom value is specified, and imputation strategy is not, + // set imputation strategy to "custom" if (CLI::HasParam("custom_value") && !(imputeStrategy == "custom")) { + imputeStrategy = "custom"; Log::Warn << "--custom_value is specified without --impute_strategy, " - << "--impute_strategy is automatically set to CustomStrategy." + << "--impute_strategy is automatically set to 'custom'." << endl; } + // custom_value must be specified when using "custom" imputation strategy if ((imputeStrategy == "custom") && !CLI::HasParam("custom_value")) Log::Fatal << "--custom_value must be specified when using " << "'custom' strategy" << endl; arma::mat input; - data::DatasetInfo info; + // DatasetInfo holds how the DatasetMapper should map the values. + // can be specified by passing map_policy classes as template parameters + // ex) DatasetMapper info; + using Mapper = DatasetMapper; + Mapper info; - data::Load(inputFile, input, info, true, true); + Load(inputFile, input, info, true, true); + // for testing purpose Log::Info << input << endl; + // print how many mapping exist in each features for (size_t i = 0; i < input.n_rows; ++i) { Log::Info << info.NumMappings(i) << " mappings in feature " << i << "." << endl; } + arma::Mat output(input); + Log::Info << "Performing '" << imputeStrategy << "' imputation strategy " + << "to feature '" << feature <<"' of '" << inputFile << "'." + << endl; + if (imputeStrategy == "custom") { - data::Imputer, - data::DatasetInfo, - data::CustomStrategy> impu; + Log::Info << "Replacing all '" << missingValue << "' with '" << customValue + << "'." << endl; + + Imputer, Mapper, CustomStrategy> impu; impu.template Impute(input, output, info, @@ -86,12 +108,14 @@ int main(int argc, char** argv) } else { - data::Imputer, - data::DatasetInfo, - data::MeanStrategy> impu; + Log::Info << "Replacing all '" << missingValue << "' with '" << imputeStrategy + << "'." 
<< endl; + Imputer, Mapper, MeanStrategy> impu; impu.Impute(input, output, info, missingValue, feature); } + + // for testing purpose Log::Info << "input::" << endl; Log::Info << input << endl; Log::Info << "output::" << endl; @@ -100,7 +124,7 @@ int main(int argc, char** argv) if (!outputFile.empty()) { Log::Info << "Saving model to '" << outputFile << "'." << endl; - data::Save(outputFile, output, false); + Save(outputFile, output, false); } } From 94b7a5c65b98271712c00ffe5673cdeb28413808 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 14 Jun 2016 20:51:15 +0900 Subject: [PATCH 10/40] update types used in datasetinfo --- src/mlpack/core/data/dataset_info.hpp | 6 ++-- src/mlpack/core/data/dataset_info_impl.hpp | 4 +-- .../core/data/map_policies/CMakeLists.txt | 4 +-- .../core/data/map_policies/datatype.hpp | 30 +++++++++++++++++++ .../data/map_policies/increment_policy.hpp | 13 ++------ .../core/data/map_policies/missing_policy.hpp | 13 ++------ 6 files changed, 43 insertions(+), 27 deletions(-) create mode 100644 src/mlpack/core/data/map_policies/datatype.hpp diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index b2124b8bdd9..83996af7d76 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -59,7 +59,7 @@ class DatasetMapper * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - typename MapPolicy::map_type_t MapString(const std::string& string, + typename MapPolicy::mapped_type MapString(const std::string& string, const size_t dimension); /** @@ -81,7 +81,7 @@ class DatasetMapper * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - typename MapPolicy::map_type_t UnmapValue(const std::string& string, + typename MapPolicy::mapped_type UnmapValue(const std::string& string, const size_t dimension); //! Return the type of a given dimension (numeric or categorical). 
@@ -118,7 +118,7 @@ class DatasetMapper std::vector types; // BiMapType definition - using BiMapType = boost::bimap; + using BiMapType = boost::bimap; // Mappings from strings to integers. // Map entries will only exist for dimensions that are categorical. diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index e55ab4df31c..b8e09f7f589 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -25,7 +25,7 @@ inline DatasetMapper::DatasetMapper(const size_t dimensionality) : // When we want to insert value into the map, // we could use the policy to map the string template -inline typename MapPolicy::map_type_t DatasetMapper::MapString( +inline typename MapPolicy::mapped_type DatasetMapper::MapString( const std::string& string, const size_t dimension) { @@ -52,7 +52,7 @@ inline const std::string& DatasetMapper::UnmapString( // Return the value corresponding to a string in a given dimension. template -inline typename MapPolicy::map_type_t DatasetMapper::UnmapValue( +inline typename MapPolicy::mapped_type DatasetMapper::UnmapValue( const std::string& string, const size_t dimension) { diff --git a/src/mlpack/core/data/map_policies/CMakeLists.txt b/src/mlpack/core/data/map_policies/CMakeLists.txt index 92f5820e975..9b40fcc19ce 100644 --- a/src/mlpack/core/data/map_policies/CMakeLists.txt +++ b/src/mlpack/core/data/map_policies/CMakeLists.txt @@ -1,8 +1,8 @@ # Define the files we need to compile # Anything not in this list will not be compiled into mlpack. set(SOURCES - increment__policy.hpp - missing__policy.hpp + increment_policy.hpp + missing_policy.hpp ) # Add directory name to sources. 
diff --git a/src/mlpack/core/data/map_policies/datatype.hpp b/src/mlpack/core/data/map_policies/datatype.hpp new file mode 100644 index 00000000000..0cafba8e672 --- /dev/null +++ b/src/mlpack/core/data/map_policies/datatype.hpp @@ -0,0 +1,30 @@ +/** + * @file missing_policy.hpp + * @author Keon Kim + * + */ +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_DATATYPE_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_DATATYPE_HPP + +#include + +namespace mlpack { +namespace data { + +/** + * The Datatype enum specifies the types of data mlpack algorithms can use. + * The vast majority of mlpack algorithms can only use numeric data (i.e. + * float/double/etc.), but some algorithms can use categorical data, specified + * via this Datatype enum and the DatasetMapper class. + */ +enum Datatype : bool /* [> bool is all the precision we need for two types <] */ +{ + numeric = 0, + categorical = 1 +}; + + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index cafe6d44b29..b83e832b95e 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -10,20 +10,13 @@ #include #include #include - +#include using namespace std; namespace mlpack { namespace data { -enum Datatype : bool /* bool is all the precision we need for two types */ -{ - numeric = 0, - categorical = 1 -}; - - /** * This class is used to map strings to incrementing unsigned integers (size_t). * First string to be mapped will be mapped to 0, next to 1 and so on. 
@@ -31,10 +24,10 @@ enum Datatype : bool /* bool is all the precision we need for two types */ class IncrementPolicy { public: - typedef size_t map_type_t; + typedef size_t mapped_type; template - map_type_t MapString(MapType& maps, + mapped_type MapString(MapType& maps, std::vector& types, const std::string& string, const size_t dimension) diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 8678a82b272..c8be3bd8c2a 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -10,6 +10,7 @@ #include #include #include +#include using namespace std; @@ -17,24 +18,16 @@ using namespace std; namespace mlpack { namespace data { -// TODO: move this to somewhere else so that it can be reused. -enum Datatype : bool /* bool is all the precision we need for two types */ -{ - numeric = 0, - categorical = 1 -}; - - /** * Same as increment map policy so far. */ class MissingPolicy { public: - typedef size_t map_type_t; + typedef size_t mapped_type; template - map_type_t MapString(MapType& maps, + mapped_type MapString(MapType& maps, std::vector& types, const std::string& string, const size_t dimension) From ebed68fef25bd3e23769ec7073fd509df1259214 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 14 Jun 2016 21:07:05 +0900 Subject: [PATCH 11/40] initialize imputer with parameters --- src/mlpack/core/data/imputer.hpp | 50 ++++++++++++++----- .../preprocess/preprocess_imputer_main.cpp | 16 ++++-- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 8248ffcaea1..10311271c34 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -25,12 +25,20 @@ namespace data { template class Imputer { - private: - Strategy strat; public: - Imputer() + Imputer(Mapper mapper, bool transpose =true): + mapper(std::move(mapper)), + 
transpose(transpose) + { + // nothing to initialize here + } + + Imputer(Strategy strat, Mapper mapper, bool traspose = true): + strat(std::move(strat)), + mapper(std::move(mapper)), + transpose(transpose) { - // nothing to initialize + // nothing to initialize here } /** @@ -45,11 +53,9 @@ class Imputer * @param transpose. */ void Impute(const MatType &input, - MatType &output, - Mapper &mapper, - const std::string &targetValue, - const size_t dimension, - const bool transpose = true) + MatType &output, + const std::string &targetValue, + const size_t dimension) { // find mapped value inside current mapper auto mappedValue = mapper.UnmapValue(targetValue, dimension); @@ -85,11 +91,9 @@ class Imputer template void Impute(const arma::Mat &input, arma::Mat &output, - Mapper &mapper, const std::string &targetValue, const T &customValue, - const size_t dimension, - const bool transpose = true) + const size_t dimension) { // find mapped value inside current mapper auto mappedValue = mapper.UnmapValue(targetValue, dimension); @@ -117,6 +121,28 @@ class Imputer } } + //! Get the strategy + const Strategy& Strategy() const { return strat } + + //! Modify the given strategy (be careful!) + Strategy& Strategy() { return strat } + + //! Get the mapper + const Mapper& Mapper() const { return mapper } + + //! Modify the given mapper (be careful!) + Mapper& Mapper() { return mapper } + + private: + // Imputation Strategy + Strategy strat; + + // DatasetMapper + Mapper mapper; + + // save transpose as a member variable since it is rarely changed. 
+ bool transpose; + }; // class Imputer } // namespace data diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 0030b9cc0bd..bd96a7e0154 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -53,7 +53,7 @@ int main(int argc, char** argv) // if custom value is specified, and imputation strategy is not, // set imputation strategy to "custom" - if (CLI::HasParam("custom_value") && !(imputeStrategy == "custom")) + if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) { imputeStrategy = "custom"; Log::Warn << "--custom_value is specified without --impute_strategy, " @@ -61,6 +61,13 @@ int main(int argc, char** argv) << endl; } + // custom value and any other impute strategies cannot be specified at + // the same time. + if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") && + imputeStrategy != "custom") + Log::Fatal << "--custom_value cannot be specified with " + << "impute strategies excluding 'custom' strategy" << endl; + // custom_value must be specified when using "custom" imputation strategy if ((imputeStrategy == "custom") && !CLI::HasParam("custom_value")) Log::Fatal << "--custom_value must be specified when using " @@ -98,10 +105,9 @@ int main(int argc, char** argv) Log::Info << "Replacing all '" << missingValue << "' with '" << customValue << "'." << endl; - Imputer, Mapper, CustomStrategy> impu; + Imputer, Mapper, CustomStrategy> impu(info); impu.template Impute(input, output, - info, missingValue, customValue, feature); @@ -111,8 +117,8 @@ int main(int argc, char** argv) Log::Info << "Replacing all '" << missingValue << "' with '" << imputeStrategy << "'." 
<< endl; - Imputer, Mapper, MeanStrategy> impu; - impu.Impute(input, output, info, missingValue, feature); + Imputer, Mapper, MeanStrategy> impu(info); + impu.Impute(input, output, missingValue, feature); } // for testing purpose From db78f394f13f0f58df8344ddc91b40bb599762eb Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 15 Jun 2016 09:55:34 +0900 Subject: [PATCH 12/40] remove datatype in dataset_info --- src/mlpack/core/data/dataset_info.hpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 83996af7d76..8eea1c83aeb 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -17,19 +17,6 @@ namespace mlpack { namespace data { - -/** - * The Datatype enum specifies the types of data mlpack algorithms can use. - * The vast majority of mlpack algorithms can only use numeric data (i.e. - * float/double/etc.), but some algorithms can use categorical data, specified - * via this Datatype enum and the DatasetMapper class. - */ -/*enum Datatype : bool [> bool is all the precision we need for two types <]*/ -/*{*/ - /*numeric = 0,*/ - /*categorical = 1*/ -/*};*/ - /** * Auxiliary information for a dataset, including mappings to/from strings and * the datatype of each dimension. 
DatasetMapper objects are optionally From da4e40981c40e171858f41de4e85e08af2c7e48d Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 15 Jun 2016 12:25:10 +0900 Subject: [PATCH 13/40] add test for imputer --- src/mlpack/tests/CMakeLists.txt | 1 + src/mlpack/tests/imputer_test.cpp | 160 ++++++++++++++++++++++++++++ src/mlpack/tests/load_save_test.cpp | 2 + 3 files changed, 163 insertions(+) create mode 100644 src/mlpack/tests/imputer_test.cpp diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 8b36a941c97..034b15eee2d 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -22,6 +22,7 @@ add_executable(mlpack_test gmm_test.cpp hmm_test.cpp hoeffding_tree_test.cpp + imputer_test.cpp ind2sub_test.cpp init_rules_test.cpp kernel_test.cpp diff --git a/src/mlpack/tests/imputer_test.cpp b/src/mlpack/tests/imputer_test.cpp new file mode 100644 index 00000000000..0009917df37 --- /dev/null +++ b/src/mlpack/tests/imputer_test.cpp @@ -0,0 +1,160 @@ +/** + * @file imputer_test.cpp + * @author Keon Kim + * + * Tests for data::Imputer class + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "old_boost_test_definitions.hpp" + +using namespace mlpack; +using namespace mlpack::data; +using namespace std; + +BOOST_AUTO_TEST_SUITE(ImputerTest); + +/** + * Make sure a CSV is loaded correctly. 
+ */ +BOOST_AUTO_TEST_CASE(CustomStrategyTest) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "1, 2, 3, 4" << endl; + f << "5, 6, 7, 8" << endl; + f.close(); + + arma::mat test; + using Mapper = DatasetMapper; + Mapper info; + BOOST_REQUIRE(data::Load("test_file.csv", test) == true); + + Imputer, Mapper, CustomStrategy> impu(info); + impu.template Impute(input, + output, + missingValue, + customValue, + feature); + + BOOST_REQUIRE_EQUAL(test.n_rows, 4); + BOOST_REQUIRE_EQUAL(test.n_cols, 2); + + for (int i = 0; i < 8; i++) + BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); + + // Remove the file. + remove("test_file.csv"); +} + +/** + * Make sure a CSV is loaded correctly. + */ +BOOST_AUTO_TEST_CASE(MeanStrategyTestt) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "1, 2, 3, 4" << endl; + f << "5, 6, 7, 8" << endl; + f.close(); + + arma::mat test; + using Mapper = DatasetMapper; + Mapper info; + BOOST_REQUIRE(data::Load("test_file.csv", test) == true); + + Imputer, Mapper, CustomStrategy> impu(info); + impu.template Impute(input, + output, + missingValue, + customValue, + feature); + + BOOST_REQUIRE_EQUAL(test.n_rows, 4); + BOOST_REQUIRE_EQUAL(test.n_cols, 2); + + for (int i = 0; i < 8; i++) + BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); + + // Remove the file. + remove("test_file.csv"); +} + +/** + * Make sure a CSV is loaded correctly. 
+ */ +BOOST_AUTO_TEST_CASE(MedianStrategyTestt) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "1, 2, 3, 4" << endl; + f << "5, 6, 7, 8" << endl; + f.close(); + + arma::mat test; + using Mapper = DatasetMapper; + Mapper info; + BOOST_REQUIRE(data::Load("test_file.csv", test) == true); + + Imputer, Mapper, CustomStrategy> impu(info); + impu.template Impute(input, + output, + missingValue, + customValue, + feature); + + BOOST_REQUIRE_EQUAL(test.n_rows, 4); + BOOST_REQUIRE_EQUAL(test.n_cols, 2); + + for (int i = 0; i < 8; i++) + BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); + + // Remove the file. + remove("test_file.csv"); +} + +/** + * Make sure a CSV is loaded correctly. + */ +BOOST_AUTO_TEST_CASE(ModeStrategyTestt) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "1, 2, 3, 4" << endl; + f << "5, 6, 7, 8" << endl; + f.close(); + + arma::mat test; + using Mapper = DatasetMapper; + Mapper info; + BOOST_REQUIRE(data::Load("test_file.csv", test) == true); + + Imputer, Mapper, CustomStrategy> impu(info); + impu.template Impute(input, + output, + missingValue, + customValue, + feature); + + BOOST_REQUIRE_EQUAL(test.n_rows, 4); + BOOST_REQUIRE_EQUAL(test.n_cols, 2); + + for (int i = 0; i < 8; i++) + BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); + + // Remove the file. 
+ remove("test_file.csv"); +} + + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 3917aead1c7..af4e371aa72 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -1401,6 +1401,8 @@ BOOST_AUTO_TEST_CASE(HarderKeonTest) BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(1), 5); BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(2), 5); BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(3), 3); + + remove("test.csv"); } /** From d8618ec909005b0453f0a3803ac1e6cc5e588f1f Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Sat, 18 Jun 2016 13:05:39 +0900 Subject: [PATCH 14/40] restructure, add listwise deletion & imputer tests --- .../CMakeLists.txt | 8 +- .../imputation_methods/custom_imputation.hpp | 58 +++++ .../imputation_methods/listwise_deletion.hpp | 60 +++++ .../imputation_methods/mean_imputation.hpp | 91 +++++++ .../imputation_methods/median_imputation.hpp | 62 +++++ .../impute_strategies/custom_strategy.hpp | 26 -- .../data/impute_strategies/mean_strategy.hpp | 62 ----- .../impute_strategies/median_strategy.hpp | 46 ---- .../data/impute_strategies/mode_strategy.hpp | 38 --- src/mlpack/core/data/imputer.hpp | 103 ++------ .../data/map_policies/increment_policy.hpp | 3 + .../core/data/map_policies/missing_policy.hpp | 5 +- .../preprocess/preprocess_imputer_main.cpp | 46 ++-- src/mlpack/tests/CMakeLists.txt | 2 +- src/mlpack/tests/imputation_test.cpp | 242 ++++++++++++++++++ src/mlpack/tests/imputer_test.cpp | 160 ------------ 16 files changed, 572 insertions(+), 440 deletions(-) rename src/mlpack/core/data/{impute_strategies => imputation_methods}/CMakeLists.txt (81%) create mode 100644 src/mlpack/core/data/imputation_methods/custom_imputation.hpp create mode 100644 src/mlpack/core/data/imputation_methods/listwise_deletion.hpp create mode 100644 src/mlpack/core/data/imputation_methods/mean_imputation.hpp create mode 100644 src/mlpack/core/data/imputation_methods/median_imputation.hpp delete 
mode 100644 src/mlpack/core/data/impute_strategies/custom_strategy.hpp delete mode 100644 src/mlpack/core/data/impute_strategies/mean_strategy.hpp delete mode 100644 src/mlpack/core/data/impute_strategies/median_strategy.hpp delete mode 100644 src/mlpack/core/data/impute_strategies/mode_strategy.hpp create mode 100644 src/mlpack/tests/imputation_test.cpp delete mode 100644 src/mlpack/tests/imputer_test.cpp diff --git a/src/mlpack/core/data/impute_strategies/CMakeLists.txt b/src/mlpack/core/data/imputation_methods/CMakeLists.txt similarity index 81% rename from src/mlpack/core/data/impute_strategies/CMakeLists.txt rename to src/mlpack/core/data/imputation_methods/CMakeLists.txt index ae3c9b9c66c..a3993192ec2 100644 --- a/src/mlpack/core/data/impute_strategies/CMakeLists.txt +++ b/src/mlpack/core/data/imputation_methods/CMakeLists.txt @@ -1,10 +1,10 @@ # Define the files we need to compile # Anything not in this list will not be compiled into mlpack. set(SOURCES - mean_strategy.hpp - mode_strategy.hpp - median_strategy.hpp - custom_strategy.hpp + custom_imputation.hpp + listwise_deletion.hpp + mean_imputation.hpp + median_imputation.hpp ) # Add directory name to sources. diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp new file mode 100644 index 00000000000..c8197d64a9d --- /dev/null +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -0,0 +1,58 @@ +/** + * @file custom_imputation.hpp + * @author Keon Kim + * + * Definition and Implementation of the empty CustomImputation class. 
+ */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP + +#include + +using namespace std; + +namespace mlpack { +namespace data { + +template +class CustomImputation +{ + public: + void Apply(const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const T& customValue, + const size_t dimension, + const bool transpose = true) + { + // initiate output + output = input; + + // replace the target value to custom value + if (transpose) + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(dimension, i) == mappedValue) + { + output(dimension, i) = customValue; + } + } + } + else + { + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(i, dimension) == mappedValue) + { + output(i, dimension) = customValue; + } + } + } + } +}; // class CustomImputation + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp new file mode 100644 index 00000000000..f089da19331 --- /dev/null +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -0,0 +1,60 @@ +/** + * @file listwise_deletion.hpp + * @author Keon Kim + * + * Definition and Implementation of the empty ListwiseDeletion class. + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP + +#include + +using namespace std; + +namespace mlpack { +namespace data { + +/** + * complete-case analysis. + * Removes all data for a case that has one or more missing values. 
+ */ +template +class ListwiseDeletion +{ + public: + void Apply(const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const size_t dimension, + const bool transpose = true) + { + // initiate output + output = input; + + if (transpose) + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(dimension, i) == mappedValue) + { + output.shed_row(i); + } + } + } + else + { + for (size_t i = 0; i < input.n_cols; ++i)\ + { + if (input(dimension, i) == mappedValue) + { + output.shed_col(i); + } + } + } + } +}; // class ListwiseDeletion + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp new file mode 100644 index 00000000000..7b9c9356011 --- /dev/null +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -0,0 +1,91 @@ +/** + * @file mean_imputation.hpp + * @author Keon Kim + * + * Definition and Implementation of the MeanImputation class. + */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP + +#include + +using namespace std; + +namespace mlpack { +namespace data { + +/** + * A simple mean imputation + */ +template +class MeanImputation +{ + public: + void Apply (const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const size_t dimension, + const bool transpose = true) + { + // initiate output + output = input; + + double sum; + size_t elems = 0; // excluding nan or missing target + + using PairType = std::pair; + // dimensions and indexes are saved as pairs inside this vector. + std::vector targets; + + // calculate number of elements and sum of them excluding mapped value or + // nan. while doing that, remember where mappedValue or NaN exists. 
+ if (transpose) + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(i, dimension) == mappedValue) + { + targets.push_back(std::make_pair(i, dimension)); + } + else + { + elems++; + sum += input(i, dimension); + } + } + } + else + { + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(dimension, i) == mappedValue) + { + targets.push_back(std::make_pair(dimension, i)); + } + else + { + elems++; + sum += input(i, dimension); + } + } + } + + // calculate mean; + double mean = sum / elems; + + // Now replace the calculated mean to the missing variables + // It only needs to loop through targets vector, not the whole matrix. + for (const PairType& target : targets) + { + if (input(target.first, target.second) == mappedValue) + { + output(target.first, target.second) = mean; + } + } + } +}; // class MeanImputation + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp new file mode 100644 index 00000000000..84c542545e7 --- /dev/null +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -0,0 +1,62 @@ +/** + * @file median_imputation.hpp + * @author Keon Kim + * + * Definition and Implementation of the MedianImputation class. 
+ */ +#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_IMPUTATION_HPP +#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_IMPUTATION_HPP + +#include + +using namespace std; + +namespace mlpack { +namespace data { + +/** + * A simple median imputation + * replace missing value with middle or average of middle values + */ +template +class MedianImputation +{ + public: + void Apply (const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const size_t dimension, + const bool transpose = true) + { + //initiate output + output = input; + + if (transpose) + { + arma::Mat medianMat = arma::median(input, 1); + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(dimension, i) == mappedValue) + { + output(dimension, i) = medianMat(0, i); + } + } + } + else + { + arma::Mat medianMat = arma::median(input, 0); + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(i, dimension) == mappedValue) + { + output(i, dimension) = medianMat(i, 0); + } + } + } + } +}; // class MeanImputation + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp b/src/mlpack/core/data/impute_strategies/custom_strategy.hpp deleted file mode 100644 index a8685f3ef2d..00000000000 --- a/src/mlpack/core/data/impute_strategies/custom_strategy.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file custom_strategy.hpp - * @author Keon Kim - * - * Definition and Implementation of the empty CustomStrategy class. 
- */ -#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP -#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_STRATEGY_HPP - -#include - - -using namespace std; - -namespace mlpack { -namespace data { - -class CustomStrategy -{ - // empty class -}; - -} // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp b/src/mlpack/core/data/impute_strategies/mean_strategy.hpp deleted file mode 100644 index 223328a7df2..00000000000 --- a/src/mlpack/core/data/impute_strategies/mean_strategy.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/** - * @file mean_strategy.hpp - * @author Keon Kim - * - * Definition and Implementation of the MeanStrategy class. - */ -#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP -#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_STRATEGY_HPP - -#include - - -using namespace std; - -namespace mlpack { -namespace data { - -/** - * The MeanStrategy - */ -class MeanStrategy -{ - public: - - /** - * Computes mean, excluding NaN or target missing variables - * - * TODO: write docs for parameters - * @param input - * @param output - * @param dimension - * @param index - * @param transpose - */ - template - void Impute(const MatType &input, - MatType &output, - const size_t dimension, - const size_t index, - const bool transpose = true) - { - if (transpose) - { - // TODO: The mean must be calculated - // without NaN or target missing variable. - MatType meanMat = arma::mean(input, 1); - output(dimension, index) = meanMat(dimension); - } - else - { - // TODO: The mean must be calculated - // without NaN or target missing variable. 
- MatType meanMat = arma::mean(input, 0); - output(index, dimension) = meanMat(index); - } - } -}; // class MeanStrategy - -} // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/impute_strategies/median_strategy.hpp b/src/mlpack/core/data/impute_strategies/median_strategy.hpp deleted file mode 100644 index 226df31ee34..00000000000 --- a/src/mlpack/core/data/impute_strategies/median_strategy.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/** - * @file median_strategy.hpp - * @author Keon Kim - * - * Definition and Implementation of the MedianStrategy class. - */ -#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP -#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_STRATEGY_HPP - - -#include - - -using namespace std; - -namespace mlpack { -namespace data { - -class MedianStrategy -{ - public: - - template - void Impute(const MatType &input, - MatType &output, - const size_t dimension, - const size_t index, - const bool transpose = true) - { - if (transpose) - { - MatType medianMat = arma::median(input, 1); - output(dimension, index) = medianMat(dimension); - } - else - { - MatType medianMat = arma::median(input, 0); - output(index, dimension) = medianMat(index); - } - } -}; - -} // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp b/src/mlpack/core/data/impute_strategies/mode_strategy.hpp deleted file mode 100644 index f78ad784c3e..00000000000 --- a/src/mlpack/core/data/impute_strategies/mode_strategy.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/** - * @file mode_strategy.hpp - * @author Keon Kim - * - * Definition and Implementation of the ModeStrategy class. 
- */ -#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP -#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MODE_STRATEGY_HPP - -#include - - -using namespace std; - -namespace mlpack { -namespace data { - -class ModeStrategy -{ - public: - template - void Impute(const arma::Mat &input, - arma::Mat &output, - const size_t dimension, - const size_t index) - { - // TODO: implement this - // considering use of arma::hist() - output(dimension, index) = 99; - cout << "IMPUTE CALLED CUSTOM MAP STRATEGY" << endl; - - } -}; - -} // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 10311271c34..c298d051719 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -9,36 +9,35 @@ #define MLPACK_CORE_DATA_IMPUTER_HPP #include -#include namespace mlpack { namespace data { /** * This class implements a way to replace target values. It is dependent on the - * user defined Strategy and Mapper used to hold dataset's information. + * user defined StrategyType and MapperType used to hold dataset's information. * * @tparam Option of imputation strategy. - * @tparam Mapper that is used to hold dataset information. + * @tparam MapperType that is used to hold dataset information. * @tparam primitive type of input and output's armadillo matrix. */ -template +template class Imputer { public: - Imputer(Mapper mapper, bool transpose =true): + Imputer(MapperType mapper, bool transpose = true): mapper(std::move(mapper)), transpose(transpose) { - // nothing to initialize here + // nothing to initialize here } - Imputer(Strategy strat, Mapper mapper, bool traspose = true): - strat(std::move(strat)), + Imputer(MapperType mapper, StrategyType strategy, bool transpose = true): + strategy(std::move(strategy)), mapper(std::move(mapper)), transpose(transpose) { - // nothing to initialize here + // nothing to initialize here } /** @@ -52,93 +51,47 @@ class Imputer * @param dimension. 
* @param transpose. */ - void Impute(const MatType &input, - MatType &output, - const std::string &targetValue, + void Impute(const arma::Mat& input, + arma::Mat& output, + const std::string& missingValue, const size_t dimension) { - // find mapped value inside current mapper - auto mappedValue = mapper.UnmapValue(targetValue, dimension); - - if(transpose) - { - for (size_t i = 0; i < input.n_rows; ++i) - { - if (input(dimension, i) == mappedValue) - { - // users can specify the imputation strategies likes - // mean, mode, etc using the class'es template parameter: Strategy. - strat.template Impute(input, output, dimension, i, transpose); - } - } - } - else - { - for (size_t i = 0; i < input.n_cols; ++i) - { - if (input(i, dimension) == mappedValue) - { - strat.template Impute(input, output, i, dimension, transpose); - } - } - } + T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); + strategy.Apply(input, output, mappedValue, dimension, transpose); } /** * This overload of Impute() lets users to define custom value that * can be replaced with the target value. 
*/ - template - void Impute(const arma::Mat &input, - arma::Mat &output, - const std::string &targetValue, - const T &customValue, + void Impute(const arma::Mat& input, + arma::Mat& output, + const std::string& missingValue, + const T& customValue, const size_t dimension) { - // find mapped value inside current mapper - auto mappedValue = mapper.UnmapValue(targetValue, dimension); - - if(transpose) - { - for (size_t i = 0; i < input.n_rows; ++i) - { - if (input(dimension, i) == mappedValue) - { - // replace the target value to custom value - output(dimension, i) = customValue; - } - } - } - else - { - for (size_t i = 0; i < input.n_cols; ++i) - { - if (input(i, dimension) == mappedValue) - { - output(i, dimension) = customValue; - } - } - } + T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); + strategy.Apply(input, output, mappedValue, customValue, dimension, transpose); } //! Get the strategy - const Strategy& Strategy() const { return strat } + const StrategyType& Strategy() const { return strategy; } - //! Modify the given strategy (be careful!) - Strategy& Strategy() { return strat } + //! Modify the given given strategy (be careful!) + StrategyType& Strategy() { return strategy; } //! Get the mapper - const Mapper& Mapper() const { return mapper } + const MapperType& Mapper() const { return mapper; } //! Modify the given mapper (be careful!) - Mapper& Mapper() { return mapper } + MapperType& Mapper() { return mapper; } private: - // Imputation Strategy - Strategy strat; + // StrategyType + StrategyType strategy; - // DatasetMapper - Mapper mapper; + // DatasetMapperType + MapperType mapper; // save transpose as a member variable since it is rarely changed. 
bool transpose; diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index b83e832b95e..f0b1d7094f1 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -40,8 +40,11 @@ class IncrementPolicy { // This string does not exist yet. size_t& numMappings = maps[dimension].second; + + // change type of the feature to categorical if (numMappings == 0) types[dimension] = Datatype::categorical; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index c8be3bd8c2a..b7e063083be 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -19,7 +19,7 @@ namespace mlpack { namespace data { /** - * Same as increment map policy so far. + * Same as increment map policy, but does not change type of features. */ class MissingPolicy { @@ -40,8 +40,7 @@ class MissingPolicy { // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; - //if (numMappings == 0) - //types[dimension] = Datatype::categorical; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index bd96a7e0154..f4230362144 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -9,7 +9,10 @@ #include #include #include -#include +#include +#include +#include +#include PROGRAM_INFO("Imputer", "This " "utility takes an any type of data and provides " @@ -44,12 +47,12 @@ int main(int argc, char** argv) // missing value should be specified if (!CLI::HasParam("missing_value")) Log::Fatal << "--missing_value must be specified in order to perform " - << "any imputation strategies." << endl; + << "any imputation strategies." << endl; // warn if user did not specify output_file if (!CLI::HasParam("output_file")) Log::Warn << "--output_file is not specified, no " - << "results from this program will be saved!" << endl; + << "results from this program will be saved!" << endl; // if custom value is specified, and imputation strategy is not, // set imputation strategy to "custom" @@ -57,8 +60,7 @@ int main(int argc, char** argv) { imputeStrategy = "custom"; Log::Warn << "--custom_value is specified without --impute_strategy, " - << "--impute_strategy is automatically set to 'custom'." - << endl; + << "--impute_strategy is automatically set to 'custom'." 
<< endl; } // custom value and any other impute strategies cannot be specified at @@ -66,19 +68,19 @@ int main(int argc, char** argv) if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") && imputeStrategy != "custom") Log::Fatal << "--custom_value cannot be specified with " - << "impute strategies excluding 'custom' strategy" << endl; + << "impute strategies excluding 'custom' strategy" << endl; // custom_value must be specified when using "custom" imputation strategy if ((imputeStrategy == "custom") && !CLI::HasParam("custom_value")) Log::Fatal << "--custom_value must be specified when using " - << "'custom' strategy" << endl; + << "'custom' strategy" << endl; arma::mat input; // DatasetInfo holds how the DatasetMapper should map the values. // can be specified by passing map_policy classes as template parameters // ex) DatasetMapper info; - using Mapper = DatasetMapper; - Mapper info; + using MapperType = DatasetMapper; + MapperType info; Load(inputFile, input, info, true, true); @@ -88,36 +90,30 @@ int main(int argc, char** argv) // print how many mapping exist in each features for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << info.NumMappings(i) << " mappings in feature " - << i << "." << endl; + Log::Info << info.NumMappings(i) << " mappings in feature " << i << "." + << endl; } - arma::Mat output(input); Log::Info << "Performing '" << imputeStrategy << "' imputation strategy " - << "to feature '" << feature <<"' of '" << inputFile << "'." - << endl; + << "on feature '" << feature << endl; + // custom strategy only if (imputeStrategy == "custom") { Log::Info << "Replacing all '" << missingValue << "' with '" << customValue - << "'." << endl; - - Imputer, Mapper, CustomStrategy> impu(info); - impu.template Impute(input, - output, - missingValue, - customValue, - feature); + << "'." 
<< endl; + Imputer> impu(info); + impu.Impute(input, output, missingValue, customValue, feature); } else { - Log::Info << "Replacing all '" << missingValue << "' with '" << imputeStrategy - << "'." << endl; + Log::Info << "Replacing all '" << missingValue << "' with '" + << imputeStrategy << "'." << endl; - Imputer, Mapper, MeanStrategy> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, feature); } diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 034b15eee2d..0b2e6602874 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -22,7 +22,7 @@ add_executable(mlpack_test gmm_test.cpp hmm_test.cpp hoeffding_tree_test.cpp - imputer_test.cpp + imputation_test.cpp ind2sub_test.cpp init_rules_test.cpp kernel_test.cpp diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp new file mode 100644 index 00000000000..02f56b62b06 --- /dev/null +++ b/src/mlpack/tests/imputation_test.cpp @@ -0,0 +1,242 @@ +/** + * @file imputation_test.cpp + * @author Keon Kim + * + * Tests for data::Imputer class + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "old_boost_test_definitions.hpp" + +using namespace mlpack; +using namespace mlpack::data; +using namespace std; + +BOOST_AUTO_TEST_SUITE(ImputationTest); + +/** + * Make sure a CSV is loaded correctly. 
+ */ +BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "a, 2, 3" << endl; + f << "5, 6, 7" << endl; + f << "8, 9, 10" << endl; + f.close(); + + arma::mat input; + arma::mat output; + string missingValue = "a"; + double customValue = 99; + size_t feature = 0; + + DatasetInfo info; + BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); + + BOOST_REQUIRE_EQUAL(input.n_rows, 3); + BOOST_REQUIRE_EQUAL(input.n_cols, 3); + + /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/ + + Imputer> impu(info); + impu.Impute(input, output, missingValue, customValue, feature); + // Remove the file. + remove("test_file.csv"); +} + +/** + * Make sure a CSV is loaded correctly. + */ +BOOST_AUTO_TEST_CASE(CustomImputationTest) +{ + arma::mat input("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat outputT; // assume input is transposed + arma::mat output; // assume input is not transposed + double customValue = 99; + double mappedValue = 0.0; + + CustomImputation imputer; + + // transposed + imputer.Apply(input, outputT, mappedValue, customValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 3), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + + // not transposed + imputer.Apply(input, output, mappedValue, customValue, 1/*dimension*/, false); + + BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5); + 
BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); +} + +/** + * Make sure a CSV is loaded correctly. + */ +BOOST_AUTO_TEST_CASE(MeanImputationTest) +{ + arma::mat input("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat outputT; // assume input is transposed + arma::mat output; // assume input is not transposed + double mappedValue = 0.0; + + MeanImputation imputer; + + // transposed + imputer.Apply(input, outputT, mappedValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 3), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + + // not transposed + imputer.Apply(input, output, mappedValue, 1/*dimension*/, false); + + BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); + 
BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); +} + +/** + * Make sure a CSV is loaded correctly. + */ +BOOST_AUTO_TEST_CASE(MedianImputationTest) +{ + arma::mat input("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat outputT; // assume input is transposed + arma::mat output; // assume input is not transposed + double mappedValue = 0.0; + + MedianImputation imputer; + + // transposed + imputer.Apply(input, outputT, mappedValue, 1/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 2), 5.5, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + + // not transposed + imputer.Apply(input, output, mappedValue, 1/*dimension*/, false); + + BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); +} + +/** + * Make sure a CSV is loaded correctly. 
+ */ +BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) +{ + arma::mat input("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat outputT; // assume input is transposed + arma::mat output; // assume input is not transposed + double mappedValue = 0.0; + + ListwiseDeletion imputer; + + // transposed + imputer.Apply(input, outputT, mappedValue, 0, true); // transposed + + BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(2, 1), 4.0, 1e-5); + + // not transposed + imputer.Apply(input, output, mappedValue, 1, false); // not transposed + + BOOST_REQUIRE_CLOSE(output(0, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5); +} + + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/src/mlpack/tests/imputer_test.cpp b/src/mlpack/tests/imputer_test.cpp deleted file mode 100644 index 0009917df37..00000000000 --- a/src/mlpack/tests/imputer_test.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/** - * @file imputer_test.cpp - * @author Keon Kim - * - * Tests for data::Imputer class - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "old_boost_test_definitions.hpp" - -using namespace mlpack; -using namespace mlpack::data; -using namespace std; - -BOOST_AUTO_TEST_SUITE(ImputerTest); - -/** - * Make sure a CSV is loaded correctly. 
- */ -BOOST_AUTO_TEST_CASE(CustomStrategyTest) -{ - fstream f; - f.open("test_file.csv", fstream::out); - f << "1, 2, 3, 4" << endl; - f << "5, 6, 7, 8" << endl; - f.close(); - - arma::mat test; - using Mapper = DatasetMapper; - Mapper info; - BOOST_REQUIRE(data::Load("test_file.csv", test) == true); - - Imputer, Mapper, CustomStrategy> impu(info); - impu.template Impute(input, - output, - missingValue, - customValue, - feature); - - BOOST_REQUIRE_EQUAL(test.n_rows, 4); - BOOST_REQUIRE_EQUAL(test.n_cols, 2); - - for (int i = 0; i < 8; i++) - BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); - - // Remove the file. - remove("test_file.csv"); -} - -/** - * Make sure a CSV is loaded correctly. - */ -BOOST_AUTO_TEST_CASE(MeanStrategyTestt) -{ - fstream f; - f.open("test_file.csv", fstream::out); - f << "1, 2, 3, 4" << endl; - f << "5, 6, 7, 8" << endl; - f.close(); - - arma::mat test; - using Mapper = DatasetMapper; - Mapper info; - BOOST_REQUIRE(data::Load("test_file.csv", test) == true); - - Imputer, Mapper, CustomStrategy> impu(info); - impu.template Impute(input, - output, - missingValue, - customValue, - feature); - - BOOST_REQUIRE_EQUAL(test.n_rows, 4); - BOOST_REQUIRE_EQUAL(test.n_cols, 2); - - for (int i = 0; i < 8; i++) - BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); - - // Remove the file. - remove("test_file.csv"); -} - -/** - * Make sure a CSV is loaded correctly. 
- */ -BOOST_AUTO_TEST_CASE(MedianStrategyTestt) -{ - fstream f; - f.open("test_file.csv", fstream::out); - f << "1, 2, 3, 4" << endl; - f << "5, 6, 7, 8" << endl; - f.close(); - - arma::mat test; - using Mapper = DatasetMapper; - Mapper info; - BOOST_REQUIRE(data::Load("test_file.csv", test) == true); - - Imputer, Mapper, CustomStrategy> impu(info); - impu.template Impute(input, - output, - missingValue, - customValue, - feature); - - BOOST_REQUIRE_EQUAL(test.n_rows, 4); - BOOST_REQUIRE_EQUAL(test.n_cols, 2); - - for (int i = 0; i < 8; i++) - BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); - - // Remove the file. - remove("test_file.csv"); -} - -/** - * Make sure a CSV is loaded correctly. - */ -BOOST_AUTO_TEST_CASE(ModeStrategyTestt) -{ - fstream f; - f.open("test_file.csv", fstream::out); - f << "1, 2, 3, 4" << endl; - f << "5, 6, 7, 8" << endl; - f.close(); - - arma::mat test; - using Mapper = DatasetMapper; - Mapper info; - BOOST_REQUIRE(data::Load("test_file.csv", test) == true); - - Imputer, Mapper, CustomStrategy> impu(info); - impu.template Impute(input, - output, - missingValue, - customValue, - feature); - - BOOST_REQUIRE_EQUAL(test.n_rows, 4); - BOOST_REQUIRE_EQUAL(test.n_cols, 2); - - for (int i = 0; i < 8; i++) - BOOST_REQUIRE_CLOSE(test[i], (double) (i + 1), 1e-5); - - // Remove the file. 
- remove("test_file.csv"); -} - - -BOOST_AUTO_TEST_SUITE_END(); From 3b8ffd0766cca7b60d5a6c552b8d464ae7ac3920 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 28 Jun 2016 06:38:44 +0900 Subject: [PATCH 15/40] fix transpose problem --- .../imputation_methods/custom_imputation.hpp | 4 +- .../imputation_methods/listwise_deletion.hpp | 8 +-- .../imputation_methods/mean_imputation.hpp | 29 +++++----- .../imputation_methods/median_imputation.hpp | 4 +- .../preprocess/preprocess_imputer_main.cpp | 55 +++++++++++++------ 5 files changed, 61 insertions(+), 39 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index c8197d64a9d..73100e20460 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -31,7 +31,7 @@ class CustomImputation // replace the target value to custom value if (transpose) { - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue) { @@ -41,7 +41,7 @@ class CustomImputation } else { - for (size_t i = 0; i < input.n_cols; ++i) + for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue) { diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index f089da19331..a2de05d161c 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -33,7 +33,7 @@ class ListwiseDeletion if (transpose) { - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue) { @@ -43,11 +43,11 @@ class ListwiseDeletion } else { - for (size_t i = 0; i < input.n_cols; ++i)\ + for (size_t i = 0; i < input.n_rows; ++i)\ { - if (input(dimension, i) == mappedValue) + if 
(input(i, dimension) == mappedValue) { - output.shed_col(i); + output.shed_col(dimension); } } } diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 7b9c9356011..b827405b64b 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -30,37 +30,40 @@ class MeanImputation // initiate output output = input; - double sum; + double sum = 0; size_t elems = 0; // excluding nan or missing target using PairType = std::pair; // dimensions and indexes are saved as pairs inside this vector. std::vector targets; + // calculate number of elements and sum of them excluding mapped value or // nan. while doing that, remember where mappedValue or NaN exists. if (transpose) { - for (size_t i = 0; i < input.n_rows; ++i) + Log::Debug << "transpose mean imputation" << std::endl; + for (size_t i = 0; i < input.n_cols; ++i) { - if (input(i, dimension) == mappedValue) + if (input(dimension, i) == mappedValue) { - targets.push_back(std::make_pair(i, dimension)); + targets.emplace_back(dimension, i); } else { elems++; - sum += input(i, dimension); + sum += input(dimension, i); } } } else { - for (size_t i = 0; i < input.n_cols; ++i) + Log::Debug << "un-transpose mean imputation" << std::endl; + for (size_t i = 0; i < input.n_rows; ++i) { - if (input(dimension, i) == mappedValue) + if (input(i, dimension) == mappedValue) { - targets.push_back(std::make_pair(dimension, i)); + targets.emplace_back(i, dimension); } else { @@ -69,18 +72,16 @@ class MeanImputation } } } - + Log::Debug << "sum: " << sum << std::endl; + Log::Debug << "elems: " << elems << std::endl; // calculate mean; - double mean = sum / elems; + const double mean = sum / elems; // Now replace the calculated mean to the missing variables // It only needs to loop through targets vector, not the whole matrix. 
for (const PairType& target : targets) { - if (input(target.first, target.second) == mappedValue) - { - output(target.first, target.second) = mean; - } + output(target.first, target.second) = mean; } } }; // class MeanImputation diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 84c542545e7..0a5910358ac 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -34,7 +34,7 @@ class MedianImputation if (transpose) { arma::Mat medianMat = arma::median(input, 1); - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue) { @@ -45,7 +45,7 @@ class MedianImputation else { arma::Mat medianMat = arma::median(input, 0); - for (size_t i = 0; i < input.n_cols; ++i) + for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue) { diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index f4230362144..b833ab1833f 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -22,9 +22,9 @@ PARAM_STRING_REQ("input_file", "File containing data,", "i"); PARAM_STRING("output_file", "File to save output", "o", ""); PARAM_STRING("missing_value", "User defined missing value", "m", "") PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "") -PARAM_STRING("impute_strategy", "imputation strategy to be applied", "s", "") +PARAM_STRING("strategy", "imputation strategy to be applied", "s", "") PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0) -PARAM_INT("feature", "the feature to apply imputation", "f", 0); +PARAM_INT("dimension", "the dimension to apply imputation", "d", 0); using namespace mlpack; using namespace arma; @@ -41,8 
+41,8 @@ int main(int argc, char** argv) const string missingValue = CLI::GetParam("missing_value"); const string mapPolicy = CLI::GetParam("map_policy"); const double customValue = CLI::GetParam("custom_value"); - const size_t feature = (size_t) CLI::GetParam("feature"); - string imputeStrategy = CLI::GetParam("impute_strategy"); + const size_t dimension = (size_t) CLI::GetParam("dimension"); + string strategy = CLI::GetParam("strategy"); // missing value should be specified if (!CLI::HasParam("missing_value")) @@ -54,11 +54,15 @@ int main(int argc, char** argv) Log::Warn << "--output_file is not specified, no " << "results from this program will be saved!" << endl; + // warn if user did not specify dimension + if (!CLI::HasParam("dimension")) + Log::Warn << "--dimension is required to be specified!" << endl; + // if custom value is specified, and imputation strategy is not, // set imputation strategy to "custom" if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) { - imputeStrategy = "custom"; + strategy = "custom"; Log::Warn << "--custom_value is specified without --impute_strategy, " << "--impute_strategy is automatically set to 'custom'." << endl; } @@ -66,12 +70,12 @@ int main(int argc, char** argv) // custom value and any other impute strategies cannot be specified at // the same time. 
if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") && - imputeStrategy != "custom") + strategy != "custom") Log::Fatal << "--custom_value cannot be specified with " << "impute strategies excluding 'custom' strategy" << endl; // custom_value must be specified when using "custom" imputation strategy - if ((imputeStrategy == "custom") && !CLI::HasParam("custom_value")) + if ((strategy == "custom") && !CLI::HasParam("custom_value")) Log::Fatal << "--custom_value must be specified when using " << "'custom' strategy" << endl; @@ -87,34 +91,51 @@ int main(int argc, char** argv) // for testing purpose Log::Info << input << endl; - // print how many mapping exist in each features + // print how many mapping exist in each dimensions for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << info.NumMappings(i) << " mappings in feature " << i << "." + Log::Info << info.NumMappings(i) << " mappings in dimension " << i << "." << endl; } arma::Mat output(input); - Log::Info << "Performing '" << imputeStrategy << "' imputation strategy " - << "on feature '" << feature << endl; + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "on dimension '" << dimension << endl; // custom strategy only - if (imputeStrategy == "custom") + if (strategy == "custom") { Log::Info << "Replacing all '" << missingValue << "' with '" << customValue << "'." << endl; Imputer> impu(info); - impu.Impute(input, output, missingValue, customValue, feature); + impu.Impute(input, output, missingValue, customValue, dimension); } else { Log::Info << "Replacing all '" << missingValue << "' with '" - << imputeStrategy << "'." << endl; - - Imputer> impu(info); - impu.Impute(input, output, missingValue, feature); + << strategy << "' strategy." 
<< endl; + + if (strategy == "mean") + { + Imputer> impu(info); + impu.Impute(input, output, missingValue, dimension); + } + else if (strategy == "median") + { + Imputer> impu(info); + impu.Impute(input, output, missingValue, dimension); + } + else if (strategy == "listwise") + { + Imputer> impu(info); + impu.Impute(input, output, missingValue, dimension); + } + else + { + Log::Warn << "You did not choose any imputation strategy" << endl; + } } // for testing purpose From e09d9bc713ebc66a95a2ed824f60ed3f1b09ceff Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 28 Jun 2016 14:14:06 +0900 Subject: [PATCH 16/40] updates and fixes on imputation methods --- .../imputation_methods/mean_imputation.hpp | 9 ++-- src/mlpack/core/data/imputer.hpp | 41 +++++++++++-------- src/mlpack/tests/imputation_test.cpp | 2 +- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index b827405b64b..43f14a045ad 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -42,7 +42,6 @@ class MeanImputation // nan. while doing that, remember where mappedValue or NaN exists. 
if (transpose) { - Log::Debug << "transpose mean imputation" << std::endl; for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue) @@ -58,7 +57,6 @@ class MeanImputation } else { - Log::Debug << "un-transpose mean imputation" << std::endl; for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue) @@ -72,8 +70,11 @@ class MeanImputation } } } - Log::Debug << "sum: " << sum << std::endl; - Log::Debug << "elems: " << elems << std::endl; + + if (elems == 0) + Log::Fatal << "it is impossible to calculate mean; no valid elements in " + << "the dimension" << std::endl; + // calculate mean; const double mean = sum / elems; diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index c298d051719..b7f4bee5eea 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -2,8 +2,8 @@ * @file imputer.hpp * @author Keon Kim * - * Defines Imputer(), a utility function to replace missing variables - * in a dataset. + * Defines Imputer class a utility function to replace missing variables in a + * dataset. */ #ifndef MLPACK_CORE_DATA_IMPUTER_HPP #define MLPACK_CORE_DATA_IMPUTER_HPP @@ -14,12 +14,12 @@ namespace mlpack { namespace data { /** - * This class implements a way to replace target values. It is dependent on the - * user defined StrategyType and MapperType used to hold dataset's information. + * Given a dataset of a particular datatype, replace user-specified missing + * value with a variable dependent on the StrategyType and MapperType. * - * @tparam Option of imputation strategy. - * @tparam MapperType that is used to hold dataset information. - * @tparam primitive type of input and output's armadillo matrix. + * @tparam T Type of armadillo matrix used for imputation strategy. + * @tparam MapperType DatasetMapper that is used to hold dataset information. + * @tparam StrategyType Imputation strategy used. 
*/ template class Imputer @@ -29,7 +29,9 @@ class Imputer mapper(std::move(mapper)), transpose(transpose) { - // nothing to initialize here + //static_assert(std::is_same::type, + //data::IncrementPolicy>::value, "The type of MapperType must be " + //"IncrementPolicy"); } Imputer(MapperType mapper, StrategyType strategy, bool transpose = true): @@ -37,7 +39,9 @@ class Imputer mapper(std::move(mapper)), transpose(transpose) { - // nothing to initialize here + //static_assert(std::is_same::type, + //data::IncrementPolicy>::value, "The type of MapperType must be " + //"IncrementPolicy"); } /** @@ -45,11 +49,9 @@ class Imputer * strategy. * * @param input Input dataset to apply imputation. - * @param output - * @oaran targetValue - * @param mapper DatasetInfo object that holds informations about the dataset. - * @param dimension. - * @param transpose. + * @param output Armadillo matrix to save the results + * @oaran missingValue User defined missing value; it can be anything. + * @param dimension Dimension to apply the imputation. */ void Impute(const arma::Mat& input, arma::Mat& output, @@ -61,8 +63,8 @@ class Imputer } /** - * This overload of Impute() lets users to define custom value that - * can be replaced with the target value. + * This overload of Impute() lets users to define custom value that can be + * replaced with the target value. */ void Impute(const arma::Mat& input, arma::Mat& output, @@ -71,7 +73,12 @@ class Imputer const size_t dimension) { T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Apply(input, output, mappedValue, customValue, dimension, transpose); + strategy.Apply(input, + output, + mappedValue, + customValue, + dimension, + transpose); } //! 
Get the strategy diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 02f56b62b06..ed776c7a08c 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -17,7 +17,7 @@ #include #include -#include "old_boost_test_definitions.hpp" +#include "test_tools.hpp" using namespace mlpack; using namespace mlpack::data; From 87d8d46396a42a4cd491b32be4f17e8582c9223d Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Fri, 1 Jul 2016 17:48:28 +0900 Subject: [PATCH 17/40] update data::load to accept different mappertypes --- src/mlpack/core/data/dataset_info.hpp | 1 + src/mlpack/core/data/dataset_info_impl.hpp | 10 ++++++++- src/mlpack/core/data/load.hpp | 4 ++-- src/mlpack/core/data/load_arff.hpp | 4 ++-- src/mlpack/core/data/load_arff_impl.hpp | 6 ++--- src/mlpack/core/data/load_impl.hpp | 22 +++++++++---------- .../core/data/map_policies/missing_policy.hpp | 20 ++++++++++++----- .../preprocess/preprocess_imputer_main.cpp | 6 ++--- 8 files changed, 46 insertions(+), 27 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 8eea1c83aeb..bfd5b709027 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -37,6 +37,7 @@ class DatasetMapper */ DatasetMapper(const size_t dimensionality = 0); + DatasetMapper(MapPolicy policy, const size_t dimensionality = 0); /** * Given the string and the dimension to which it belongs, return its numeric * mapping. If no mapping yet exists, the string is added to the list of diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index b8e09f7f589..de543ab06d6 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -18,9 +18,17 @@ template inline DatasetMapper::DatasetMapper(const size_t dimensionality) : types(dimensionality, Datatype::numeric) { - // Nothing to initialize. 
+ // Nothing to initialize here. } +template +inline DatasetMapper::DatasetMapper(MapPolicy policy, + const size_t dimensionality) : + types(dimensionality, Datatype::numeric), + policy(std::move(policy)) +{ + // Nothing to initialize here. +} // When we want to insert value into the map, // we could use the policy to map the string diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 19e238a9403..b2009d8b9fb 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -91,10 +91,10 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info, + MapperType& info, const bool fatal = false, const bool transpose = true); diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp index f04e38ab8bd..60579ca7b1b 100644 --- a/src/mlpack/core/data/load_arff.hpp +++ b/src/mlpack/core/data/load_arff.hpp @@ -42,10 +42,10 @@ void LoadARFF(const std::string& filename, arma::Mat& matrix); * @param info DatasetInfo object; can be default-constructed or pre-existing * from another call to LoadARFF(). */ -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info); + MapperType& info); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 68c9184fe71..edb9057aea6 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -15,10 +15,10 @@ namespace mlpack { namespace data { -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info) + MapperType& info) { // First, open the file. 
std::ifstream ifs; @@ -98,7 +98,7 @@ void LoadARFF(const std::string& filename, // Reset the DatasetInfo object, if needed. if (info.Dimensionality() == 0) { - info = DatasetInfo(dimensionality); + info = MapperType(dimensionality); } else if (info.Dimensionality() != dimensionality) { diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 5479bab17d5..8349f4c6a34 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -59,13 +59,13 @@ void TransPoseTokens(std::vector> const &input, } } -template -void MapToNumerical(const std::vector &tokens, - size_t &row, - DatasetInfo &info, - arma::Mat &matrix) +template +void MapToNumerical(const std::vector& tokens, + size_t& row, + MapperType& info, + arma::Mat& matrix) { - auto notNumber = [](const std::string &str) + auto notNumber = [](const std::string& str) { eT val(0); std::stringstream token; @@ -370,10 +370,10 @@ bool Load(const std::string& filename, } // Load with mappings. Unfortunately we have to implement this ourselves. 
-template +template bool Load(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info, + MapperType& info, const bool fatal, const bool transpose) { @@ -446,16 +446,16 @@ bool Load(const std::string& filename, if (transpose) { matrix.set_size(cols, rows); - info = DatasetInfo(cols); + info = MapperType(cols); } else { matrix.set_size(rows, cols); - info = DatasetInfo(rows); + info = MapperType(rows); } stream.close(); - stream.open(filename, std::fstream::in); + stream.open(filename, std::fstream::in); if(transpose) { diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index b7e063083be..4cc8a9647ec 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -26,19 +26,27 @@ class MissingPolicy public: typedef size_t mapped_type; + //explicit MissingPolicy(std::set specificString) : + //specificString(std::move(specificString)) + //{ + //// Nothing to initialize here. + //} + + template mapped_type MapString(MapType& maps, - std::vector& types, - const std::string& string, - const size_t dimension) + std::vector& types, + const std::string& string, + const size_t dimension) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. - if (maps.count(dimension) == 0 || + if (//specificString.count(string) != 0 && + maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0) { - // This string does not exist yet. + // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; typedef boost::bimap::value_type PairType; @@ -51,6 +59,8 @@ class MissingPolicy return maps[dimension].first.left.at(string); } } + private: + //std::set specificString; }; // class MissingPolicy } // namespace data diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index b833ab1833f..6a290b9dc9a 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -83,10 +84,10 @@ int main(int argc, char** argv) // DatasetInfo holds how the DatasetMapper should map the values. // can be specified by passing map_policy classes as template parameters // ex) DatasetMapper info; - using MapperType = DatasetMapper; + using MapperType = DatasetMapper; MapperType info; - Load(inputFile, input, info, true, true); + Load(inputFile, input, info, true, true); // for testing purpose Log::Info << input << endl; @@ -100,7 +101,6 @@ int main(int argc, char** argv) arma::Mat output(input); - Log::Info << "Performing '" << strategy << "' imputation strategy " << "on dimension '" << dimension << endl; From de0b2dbba2422296d801e7aa60dc2ed06091ae1a Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Fri, 1 Jul 2016 23:47:31 +0900 Subject: [PATCH 18/40] update data::load to accept different policies --- src/mlpack/core/data/dataset_info.hpp | 14 +- src/mlpack/core/data/dataset_info_impl.hpp | 44 +++-- src/mlpack/core/data/imputer.hpp | 1 + src/mlpack/core/data/load.hpp | 12 +- src/mlpack/core/data/load_arff.hpp | 4 +- src/mlpack/core/data/load_arff_impl.hpp | 6 +- src/mlpack/core/data/load_impl.hpp | 182 +++++++++++++++++- .../core/data/map_policies/missing_policy.hpp | 23 ++- .../preprocess/preprocess_imputer_main.cpp | 18 +- src/mlpack/tests/imputation_test.cpp | 8 +- 10 files changed, 250 insertions(+), 62 
deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index bfd5b709027..eaa68825de3 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -24,9 +24,9 @@ namespace data { * (Datatype::numeric or Datatype::categorical) as well as mappings from strings * to unsigned integers and vice versa. * - * @tparam MapPolicy Mapping policy used to specify MapString(); + * @tparam PolicyType Mapping policy used to specify MapString(); */ -template +template class DatasetMapper { public: @@ -37,7 +37,7 @@ class DatasetMapper */ DatasetMapper(const size_t dimensionality = 0); - DatasetMapper(MapPolicy policy, const size_t dimensionality = 0); + DatasetMapper(PolicyType& policy, const size_t dimensionality = 0); /** * Given the string and the dimension to which it belongs, return its numeric * mapping. If no mapping yet exists, the string is added to the list of @@ -47,7 +47,7 @@ class DatasetMapper * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - typename MapPolicy::mapped_type MapString(const std::string& string, + typename PolicyType::mapped_type MapString(const std::string& string, const size_t dimension); /** @@ -69,7 +69,7 @@ class DatasetMapper * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - typename MapPolicy::mapped_type UnmapValue(const std::string& string, + typename PolicyType::mapped_type UnmapValue(const std::string& string, const size_t dimension); //! Return the type of a given dimension (numeric or categorical). @@ -106,7 +106,7 @@ class DatasetMapper std::vector types; // BiMapType definition - using BiMapType = boost::bimap; + using BiMapType = boost::bimap; // Mappings from strings to integers. // Map entries will only exist for dimensions that are categorical. 
@@ -114,7 +114,7 @@ class DatasetMapper MapType maps; - MapPolicy policy; + PolicyType policy; }; // Use typedef to provide backward compatibility diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index de543ab06d6..93e2a13f4f1 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -2,7 +2,7 @@ * @file dataset_info_impl.hpp * @author Ryan Curtin * - * An implementation of the DatasetMapper class. + * An implementation of the DatasetMapper class. */ #ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP #define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP @@ -14,26 +14,28 @@ namespace mlpack { namespace data { // Default constructor. -template -inline DatasetMapper::DatasetMapper(const size_t dimensionality) : +template +inline DatasetMapper::DatasetMapper(const size_t dimensionality) : types(dimensionality, Datatype::numeric) { + Log::Debug << "DatasetMapper(dimensionality)" << std::endl; // Nothing to initialize here. } -template -inline DatasetMapper::DatasetMapper(MapPolicy policy, +template +inline DatasetMapper::DatasetMapper(PolicyType& policy, const size_t dimensionality) : types(dimensionality, Datatype::numeric), policy(std::move(policy)) { + Log::Debug << "DatasetMapper(policy, dimensionality)" << std::endl; // Nothing to initialize here. } // When we want to insert value into the map, // we could use the policy to map the string -template -inline typename MapPolicy::mapped_type DatasetMapper::MapString( +template +inline typename PolicyType::mapped_type DatasetMapper::MapString( const std::string& string, const size_t dimension) { @@ -41,8 +43,8 @@ inline typename MapPolicy::mapped_type DatasetMapper::MapString( } // Return the string corresponding to a value in a given dimension. 
-template -inline const std::string& DatasetMapper::UnmapString( +template +inline const std::string& DatasetMapper::UnmapString( const size_t value, const size_t dimension) { @@ -50,7 +52,7 @@ inline const std::string& DatasetMapper::UnmapString( if (maps[dimension].first.right.count(value) == 0) { std::ostringstream oss; - oss << "DatasetMapper::UnmapString(): value '" << value + oss << "DatasetMapper::UnmapString(): value '" << value << "' unknown for dimension " << dimension; throw std::invalid_argument(oss.str()); } @@ -59,8 +61,8 @@ inline const std::string& DatasetMapper::UnmapString( } // Return the value corresponding to a string in a given dimension. -template -inline typename MapPolicy::mapped_type DatasetMapper::UnmapValue( +template +inline typename PolicyType::mapped_type DatasetMapper::UnmapValue( const std::string& string, const size_t dimension) { @@ -68,7 +70,7 @@ inline typename MapPolicy::mapped_type DatasetMapper::UnmapValue( if (maps[dimension].first.left.count(string) == 0) { std::ostringstream oss; - oss << "DatasetMapper::UnmapValue(): string '" << string + oss << "DatasetMapper::UnmapValue(): string '" << string << "' unknown for dimension " << dimension; throw std::invalid_argument(oss.str()); } @@ -77,8 +79,8 @@ inline typename MapPolicy::mapped_type DatasetMapper::UnmapValue( } // Get the type of a particular dimension. 
-template -inline Datatype DatasetMapper::Type(const size_t dimension) const +template +inline Datatype DatasetMapper::Type(const size_t dimension) const { if (dimension >= types.size()) { @@ -91,8 +93,8 @@ inline Datatype DatasetMapper::Type(const size_t dimension) const return types[dimension]; } -template -inline Datatype& DatasetMapper::Type(const size_t dimension) +template +inline Datatype& DatasetMapper::Type(const size_t dimension) { if (dimension >= types.size()) types.resize(dimension + 1, Datatype::numeric); @@ -100,15 +102,15 @@ inline Datatype& DatasetMapper::Type(const size_t dimension) return types[dimension]; } -template +template inline -size_t DatasetMapper::NumMappings(const size_t dimension) const +size_t DatasetMapper::NumMappings(const size_t dimension) const { return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; } -template -inline size_t DatasetMapper::Dimensionality() const +template +inline size_t DatasetMapper::Dimensionality() const { return types.size(); } diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index b7f4bee5eea..45966392d63 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -9,6 +9,7 @@ #define MLPACK_CORE_DATA_IMPUTER_HPP #include +#include "dataset_info.hpp" namespace mlpack { namespace data { diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index b2009d8b9fb..476c3ad3664 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -91,10 +91,18 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::string& filename, arma::Mat& matrix, - MapperType& info, + DatasetMapper& info, + const bool fatal = false, + const bool transpose = true); + +template +bool Load(const std::string& filename, + arma::Mat& matrix, + DatasetMapper& info, + PolicyType& policy, const bool fatal = false, const bool transpose = true); diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp index 60579ca7b1b..ff6c4315920 100644 --- a/src/mlpack/core/data/load_arff.hpp +++ b/src/mlpack/core/data/load_arff.hpp @@ -42,10 +42,10 @@ void LoadARFF(const std::string& filename, arma::Mat& matrix); * @param info DatasetInfo object; can be default-constructed or pre-existing * from another call to LoadARFF(). */ -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - MapperType& info); + DatasetMapper& info); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index edb9057aea6..71ccea64a86 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -15,10 +15,10 @@ namespace mlpack { namespace data { -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - MapperType& info) + DatasetMapper& info) { // First, open the file. std::ifstream ifs; @@ -98,7 +98,7 @@ void LoadARFF(const std::string& filename, // Reset the DatasetInfo object, if needed. 
if (info.Dimensionality() == 0) { - info = MapperType(dimensionality); + info = DatasetMapper(dimensionality); } else if (info.Dimensionality() != dimensionality) { diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 8349f4c6a34..f1e7651e0db 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -59,10 +59,10 @@ void TransPoseTokens(std::vector> const &input, } } -template +template void MapToNumerical(const std::vector& tokens, size_t& row, - MapperType& info, + DatasetMapper& info, arma::Mat& matrix) { auto notNumber = [](const std::string& str) @@ -370,10 +370,180 @@ bool Load(const std::string& filename, } // Load with mappings. Unfortunately we have to implement this ourselves. -template +template bool Load(const std::string& filename, arma::Mat& matrix, - MapperType& info, + DatasetMapper& info, + PolicyType& policy, + const bool fatal, + const bool transpose) +{ + // Get the extension and load as necessary. + Timer::Start("loading_data"); + Log::Debug << "Load with Policy" << std::endl; + // Get the extension. + std::string extension = Extension(filename); + + // Catch nonexistent files by opening the stream ourselves. + std::fstream stream; + stream.open(filename.c_str(), std::fstream::in); + + if (!stream.is_open()) + { + Timer::Stop("loading_data"); + if (fatal) + Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; + else + Log::Warn << "Cannot open file '" << filename << "'; load failed." + << std::endl; + + return false; + } + + if (extension == "csv" || extension == "tsv" || extension == "txt") + { + // True if we're looking for commas; if false, we're looking for spaces. + bool commas = (extension == "csv"); + + std::string type; + if (extension == "csv") + type = "CSV data"; + else + type = "raw ASCII-formatted data"; + + Log::Info << "Loading '" << filename << "' as " << type << ". 
" + << std::flush; + std::string separators; + if (commas) + separators = ","; + else + separators = " \t"; + + // We'll load this as CSV (or CSV with spaces or tabs) according to + // RFC4180. So the first thing to do is determine the size of the matrix. + std::string buffer; + size_t cols = 0; + + std::getline(stream, buffer, '\n'); + // Count commas and whitespace in the line, ignoring anything inside + // quotes. + typedef boost::tokenizer> Tokenizer; + boost::escaped_list_separator sep("\\", separators, "\""); + Tokenizer tok(buffer, sep); + for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) + ++cols; + + // Now count the number of lines in the file. We've already counted the + // first one. + size_t rows = 1; + while (!stream.eof() && !stream.bad() && !stream.fail()) + { + std::getline(stream, buffer, '\n'); + if (!stream.fail()) + ++rows; + } + + // Now we have the size. So resize our matrix. + if (transpose) + { + matrix.set_size(cols, rows); + Log::Debug << "initialize datasetmapper with policy" << std::endl; + info = DatasetMapper(policy, cols); + } + else + { + matrix.set_size(rows, cols); + Log::Debug << "initialize datasetmapper with policy" << std::endl; + info = DatasetMapper(policy, rows); + } + + stream.close(); + stream.open(filename, std::fstream::in); + + if(transpose) + { + std::vector> tokensArray; + std::vector tokens; + while (!stream.bad() && !stream.fail() && !stream.eof()) + { + // Extract line by line. + std::getline(stream, buffer, '\n'); + Tokenizer lineTok(buffer, sep); + tokens = details::ToTokens(lineTok); + if(tokens.size() == cols) + { + tokensArray.emplace_back(std::move(tokens)); + } + } + for(size_t i = 0; i != cols; ++i) + { + details::TransPoseTokens(tokensArray, tokens, i); + details::MapToNumerical(tokens, i, + info, matrix); + } + } + else + { + size_t row = 0; + while (!stream.bad() && !stream.fail() && !stream.eof()) + { + // Extract line by line. 
+ std::getline(stream, buffer, '\n'); + Tokenizer lineTok(buffer, sep); + details::MapToNumerical(details::ToTokens(lineTok), row, + info, matrix); + ++row; + } + } + } + else if (extension == "arff") + { + Log::Info << "Loading '" << filename << "' as ARFF dataset. " + << std::flush; + try + { + LoadARFF(filename, matrix, info); + + // We transpose by default. So, un-transpose if necessary... + if (!transpose) + inplace_transpose(matrix); + } + catch (std::exception& e) + { + if (fatal) + Log::Fatal << e.what() << std::endl; + else + Log::Warn << e.what() << std::endl; + } + } + else + { + // The type is unknown. + Timer::Stop("loading_data"); + if (fatal) + Log::Fatal << "Unable to detect type of '" << filename << "'; " + << "incorrect extension?" << std::endl; + else + Log::Warn << "Unable to detect type of '" << filename << "'; load failed." + << " Incorrect extension?" << std::endl; + + return false; + } + + Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows) + << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n"; + + Timer::Stop("loading_data"); + + return true; +} + + +// Load with mappings. Unfortunately we have to implement this ourselves. 
+template +bool Load(const std::string& filename, + arma::Mat& matrix, + DatasetMapper& info, const bool fatal, const bool transpose) { @@ -446,12 +616,12 @@ bool Load(const std::string& filename, if (transpose) { matrix.set_size(cols, rows); - info = MapperType(cols); + info = DatasetMapper(cols); } else { matrix.set_size(rows, cols); - info = MapperType(rows); + info = DatasetMapper(rows); } stream.close(); diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 4cc8a9647ec..2611e17ef13 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -26,11 +26,18 @@ class MissingPolicy public: typedef size_t mapped_type; - //explicit MissingPolicy(std::set specificString) : - //specificString(std::move(specificString)) - //{ - //// Nothing to initialize here. - //} + MissingPolicy() + { + Log::Debug << "MissingPolicy()" << std::endl; + missingSet.insert("a"); + } + + explicit MissingPolicy(std::set missingSet) : + missingSet(std::move(missingSet)) + { + Log::Debug << "MissingPolicy()" << std::endl; + // Nothing to initialize here. + } template @@ -42,11 +49,11 @@ class MissingPolicy // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. - if (//specificString.count(string) != 0 && + if (missingSet.count(string) != 0 && maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0) { - // This string does not exist yet. + // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; typedef boost::bimap::value_type PairType; @@ -60,7 +67,7 @@ class MissingPolicy } } private: - //std::set specificString; + std::set missingSet; }; // class MissingPolicy } // namespace data diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 6a290b9dc9a..687e78ee263 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -84,10 +84,12 @@ int main(int argc, char** argv) // DatasetInfo holds how the DatasetMapper should map the values. // can be specified by passing map_policy classes as template parameters // ex) DatasetMapper info; - using MapperType = DatasetMapper; - MapperType info; + std::set missingSet; + missingSet.insert(missingValue); + MissingPolicy policy(missingSet); + DatasetMapper info(policy); - Load(inputFile, input, info, true, true); + Load(inputFile, input, info, policy, true, true); // for testing purpose Log::Info << input << endl; @@ -109,7 +111,7 @@ int main(int argc, char** argv) { Log::Info << "Replacing all '" << missingValue << "' with '" << customValue << "'." 
<< endl; - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, customValue, dimension); } else @@ -119,17 +121,17 @@ int main(int argc, char** argv) if (strategy == "mean") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } else if (strategy == "median") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } else if (strategy == "listwise") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } else @@ -139,8 +141,6 @@ int main(int argc, char** argv) } // for testing purpose - Log::Info << "input::" << endl; - Log::Info << input << endl; Log::Info << "output::" << endl; Log::Info << output << endl; diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index ed776c7a08c..8c759bdd419 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -51,10 +51,10 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/ - Imputer> impu(info); - impu.Impute(input, output, missingValue, customValue, feature); + //Imputer> impu(info); + //impu.Impute(input, output, missingValue, customValue, feature); // Remove the file. 
remove("test_file.csv"); } From bc187cab7bb3d8847a2bd60343eee8eb7719118e Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Sat, 2 Jul 2016 02:14:34 +0900 Subject: [PATCH 19/40] add imputer doc --- src/mlpack/core/data/load.hpp | 46 ++++++++++++++++--- src/mlpack/core/data/load_impl.hpp | 2 +- .../core/data/map_policies/missing_policy.hpp | 1 - .../preprocess/preprocess_imputer_main.cpp | 33 ++++++++----- 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 476c3ad3664..da770b4e8cd 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -61,10 +61,10 @@ bool Load(const std::string& filename, /** * Loads a matrix from a file, guessing the filetype from the extension and - * mapping categorical features with a DatasetInfo object. This will transpose - * the matrix (unless the transpose parameter is set to false). This particular - * overload of Load() can only load text-based formats, such as those given - * below: + * mapping categorical features with a DatasetMapper object. This will + * transpose the matrix (unless the transpose parameter is set to false). + * This particular overload of Load() can only load text-based formats, such as + * those given below: * * - CSV (csv_ascii), denoted by .csv, or optionally .txt * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt @@ -81,12 +81,12 @@ bool Load(const std::string& filename, * mlpack requires column-major matrices, this should be left at its default * value of 'true'. * - * The DatasetInfo object passed to this function will be re-created, so any + * The DatasetMapper object passed to this function will be re-created, so any * mappings from previous loads will be lost. * * @param filename Name of file to load. * @param matrix Matrix to load contents of file into. - * @param info DatasetInfo object to populate with mappings and data types. + * @param info DatasetMapper object to populate with mappings and data types. 
* @param fatal If an error should be reported as fatal (default false). * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. @@ -98,6 +98,40 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); +/** + * Loads a matrix from a file, guessing the filetype from the extension and + * mapping categorical features with a DatasetMapper object. This will + * transpose the matrix (unless the transpose parameter is set to false). + * This particular overload of Load() can only load text-based formats, such as + * those given below: + * + * - CSV (csv_ascii), denoted by .csv, or optionally .txt + * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt + * - ASCII (raw_ascii), denoted by .txt + * + * If the file extension is not one of those types, an error will be given. + * This is preferable to Armadillo's default behavior of loading an unknown + * filetype as raw_binary, which can have very confusing effects. + * + * If the parameter 'fatal' is set to true, a std::runtime_error exception will + * be thrown if the matrix does not load successfully. The parameter + * 'transpose' controls whether or not the matrix is transposed after loading. + * In most cases, because data is generally stored in a row-major format and + * mlpack requires column-major matrices, this should be left at its default + * value of 'true'. + * + * The DatasetMapper object passed to this function will be re-created, so any + * mappings from previous loads will be lost. policy is passed to the + * constructor of DatasetMapper to create a new instance. + * + * @param filename Name of file to load. + * @param matrix Matrix to load contents of file into. + * @param info DatasetMapper object to populate with mappings and data types. + * @param policy Policy class that decides how the DatasetMapper should map. + * @param fatal If an error should be reported as fatal (default false). 
+ * @param transpose If true, transpose the matrix after loading. + * @return Boolean value indicating success or failure of load. + */ template bool Load(const std::string& filename, arma::Mat& matrix, diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index f1e7651e0db..4f03221d094 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -369,7 +369,7 @@ bool Load(const std::string& filename, return success; } -// Load with mappings. Unfortunately we have to implement this ourselves. +// Load with mappings and policy. template bool Load(const std::string& filename, arma::Mat& matrix, diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 2611e17ef13..a38d877a8fc 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -29,7 +29,6 @@ class MissingPolicy MissingPolicy() { Log::Debug << "MissingPolicy()" << std::endl; - missingSet.insert("a"); } explicit MissingPolicy(std::set missingSet) : diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 687e78ee263..73344071e99 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -15,14 +15,23 @@ #include #include -PROGRAM_INFO("Imputer", "This " - "utility takes an any type of data and provides " - "imputation strategies for missing data."); +PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user " + "defined missing variable to another to provide more meaningful analysis " + "\n\n" + "The program does not modify the original file, but instead makes a " + "separate file to save the output data; The program requires you to " + "specify the file name with --output_file (-o)." 
+ "\n\n" + "For example, if we consider 'NULL' in dimension 0 to be a missing " + "variable and want to delete whole row containing the NULL in the " + "column-wise dataset, and save the result to result.csv, we could run" + "\n\n" + "$ mlpack_preprocess_imputer -i dataset.csv -o result.csv -m NULL -d 0 \n" + "> -s listwise_deletion") PARAM_STRING_REQ("input_file", "File containing data,", "i"); PARAM_STRING("output_file", "File to save output", "o", ""); PARAM_STRING("missing_value", "User defined missing value", "m", "") -PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "") PARAM_STRING("strategy", "imputation strategy to be applied", "s", "") PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0) PARAM_INT("dimension", "the dimension to apply imputation", "d", 0); @@ -40,7 +49,6 @@ int main(int argc, char** argv) const string inputFile = CLI::GetParam("input_file"); const string outputFile = CLI::GetParam("output_file"); const string missingValue = CLI::GetParam("missing_value"); - const string mapPolicy = CLI::GetParam("map_policy"); const double customValue = CLI::GetParam("custom_value"); const size_t dimension = (size_t) CLI::GetParam("dimension"); string strategy = CLI::GetParam("strategy"); @@ -81,12 +89,13 @@ int main(int argc, char** argv) << "'custom' strategy" << endl; arma::mat input; - // DatasetInfo holds how the DatasetMapper should map the values. + // Policy tells how the DatasetMapper should map the values. 
// can be specified by passing map_policy classes as template parameters // ex) DatasetMapper info; std::set missingSet; missingSet.insert(missingValue); MissingPolicy policy(missingSet); + using MapperType = DatasetMapper; DatasetMapper info(policy); Load(inputFile, input, info, policy, true, true); @@ -104,14 +113,14 @@ int main(int argc, char** argv) arma::Mat output(input); Log::Info << "Performing '" << strategy << "' imputation strategy " - << "on dimension '" << dimension << endl; + << "on dimension '" << dimension << "'." << endl; // custom strategy only if (strategy == "custom") { Log::Info << "Replacing all '" << missingValue << "' with '" << customValue << "'." << endl; - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, customValue, dimension); } else @@ -121,17 +130,17 @@ int main(int argc, char** argv) if (strategy == "mean") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } else if (strategy == "median") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } - else if (strategy == "listwise") + else if (strategy == "listwise_deletion") { - Imputer> impu(info); + Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } else From a340f69411b6a6d16106023e4bbcceb9d4688322 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Sat, 2 Jul 2016 09:00:40 +0900 Subject: [PATCH 20/40] debug median imputation and listwise deletion --- .../core/data/imputation_methods/listwise_deletion.hpp | 7 +++++-- .../core/data/imputation_methods/median_imputation.hpp | 4 ++-- src/mlpack/core/data/map_policies/missing_policy.hpp | 2 +- .../methods/preprocess/preprocess_imputer_main.cpp | 4 ++-- src/mlpack/tests/imputation_test.cpp | 10 +++++----- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp 
b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index a2de05d161c..f957a85bdbc 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -30,6 +30,7 @@ class ListwiseDeletion { // initiate output output = input; + size_t count = 0; if (transpose) { @@ -37,7 +38,8 @@ class ListwiseDeletion { if (input(dimension, i) == mappedValue) { - output.shed_row(i); + output.shed_col(i - count); + count++; } } } @@ -47,7 +49,8 @@ class ListwiseDeletion { if (input(i, dimension) == mappedValue) { - output.shed_col(dimension); + output.shed_row(i - count); + count++; } } } diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 0a5910358ac..05eff340d70 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -38,7 +38,7 @@ class MedianImputation { if (input(dimension, i) == mappedValue) { - output(dimension, i) = medianMat(0, i); + output(dimension, i) = medianMat(dimension, 0); } } } @@ -49,7 +49,7 @@ class MedianImputation { if (input(i, dimension) == mappedValue) { - output(i, dimension) = medianMat(i, 0); + output(i, dimension) = medianMat(0, dimension); } } } diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index a38d877a8fc..970a0ee797b 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -34,7 +34,7 @@ class MissingPolicy explicit MissingPolicy(std::set missingSet) : missingSet(std::move(missingSet)) { - Log::Debug << "MissingPolicy()" << std::endl; + Log::Debug << "MissingPolicy(missingSet)" << std::endl; // Nothing to initialize here. 
} diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 73344071e99..015ad96cd7a 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -90,12 +90,12 @@ int main(int argc, char** argv) arma::mat input; // Policy tells how the DatasetMapper should map the values. - // can be specified by passing map_policy classes as template parameters - // ex) DatasetMapper info; std::set missingSet; missingSet.insert(missingValue); + Log::Debug << "initalize MissingPolicy(missingSet)" << endl; MissingPolicy policy(missingSet); using MapperType = DatasetMapper; + Log::Debug << "initalize info(policy)" << endl; DatasetMapper info(policy); Load(inputFile, input, info, policy, true, true); diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 8c759bdd419..6abbe1da697 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -91,7 +91,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, customValue, 1/*dimension*/, false); + imputer.Apply(input, output, mappedValue, customValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5); @@ -122,7 +122,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) MeanImputation imputer; // transposed - imputer.Apply(input, outputT, mappedValue, 0/*dimension*/, true); + imputer.Apply(input, outputT, mappedValue, 0, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.5, 1e-5); @@ -138,7 +138,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, 1/*dimension*/, false); + imputer.Apply(input, output, mappedValue, 1, false); 
BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) MedianImputation imputer; // transposed - imputer.Apply(input, outputT, mappedValue, 1/*dimension*/, true); + imputer.Apply(input, outputT, mappedValue, 1, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 0.0, 1e-5); @@ -185,7 +185,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, 1/*dimension*/, false); + imputer.Apply(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); From 21d94c04652e8faadd5e8991103a3b73b4c81033 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Sun, 3 Jul 2016 03:01:54 +0900 Subject: [PATCH 21/40] remove duplicate code in load function --- src/mlpack/core/data/dataset_info.hpp | 6 +- src/mlpack/core/data/dataset_info_impl.hpp | 6 + src/mlpack/core/data/load.hpp | 6 +- src/mlpack/core/data/load_impl.hpp | 167 --------------------- 4 files changed, 15 insertions(+), 170 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index eaa68825de3..91e150bed3d 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -35,9 +35,9 @@ class DatasetMapper * the dimensionality cannot be changed later; you will have to create a new * DatasetMapper object. */ - DatasetMapper(const size_t dimensionality = 0); + explicit DatasetMapper(const size_t dimensionality = 0); - DatasetMapper(PolicyType& policy, const size_t dimensionality = 0); + explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0); /** * Given the string and the dimension to which it belongs, return its numeric * mapping. 
If no mapping yet exists, the string is added to the list of @@ -101,6 +101,8 @@ class DatasetMapper ar & data::CreateNVP(maps, "maps"); } + PolicyType& Policy() const; + private: //! Types of each dimension. std::vector types; diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 93e2a13f4f1..c95fa1aac81 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -115,6 +115,12 @@ inline size_t DatasetMapper::Dimensionality() const return types.size(); } +template +inline PolicyType& DatasetMapper::Policy() const +{ + return this->policy; +} + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index da770b4e8cd..8694cc2f045 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -96,7 +96,11 @@ bool Load(const std::string& filename, arma::Mat& matrix, DatasetMapper& info, const bool fatal = false, - const bool transpose = true); + const bool transpose = true) +{ + PolicyType policy; + return Load(filename, matrix, info, policy, fatal, transpose); +} /** * Loads a matrix from a file, guessing the filetype from the extension and diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 4f03221d094..c44b77f0b0d 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -538,173 +538,6 @@ bool Load(const std::string& filename, return true; } - -// Load with mappings. Unfortunately we have to implement this ourselves. -template -bool Load(const std::string& filename, - arma::Mat& matrix, - DatasetMapper& info, - const bool fatal, - const bool transpose) -{ - // Get the extension and load as necessary. - Timer::Start("loading_data"); - - // Get the extension. - std::string extension = Extension(filename); - - // Catch nonexistent files by opening the stream ourselves. 
- std::fstream stream; - stream.open(filename.c_str(), std::fstream::in); - - if (!stream.is_open()) - { - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; - else - Log::Warn << "Cannot open file '" << filename << "'; load failed." - << std::endl; - - return false; - } - - if (extension == "csv" || extension == "tsv" || extension == "txt") - { - // True if we're looking for commas; if false, we're looking for spaces. - bool commas = (extension == "csv"); - - std::string type; - if (extension == "csv") - type = "CSV data"; - else - type = "raw ASCII-formatted data"; - - Log::Info << "Loading '" << filename << "' as " << type << ". " - << std::flush; - std::string separators; - if (commas) - separators = ","; - else - separators = " \t"; - - // We'll load this as CSV (or CSV with spaces or tabs) according to - // RFC4180. So the first thing to do is determine the size of the matrix. - std::string buffer; - size_t cols = 0; - - std::getline(stream, buffer, '\n'); - // Count commas and whitespace in the line, ignoring anything inside - // quotes. - typedef boost::tokenizer> Tokenizer; - boost::escaped_list_separator sep("\\", separators, "\""); - Tokenizer tok(buffer, sep); - for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) - ++cols; - - // Now count the number of lines in the file. We've already counted the - // first one. - size_t rows = 1; - while (!stream.eof() && !stream.bad() && !stream.fail()) - { - std::getline(stream, buffer, '\n'); - if (!stream.fail()) - ++rows; - } - - // Now we have the size. So resize our matrix. 
- if (transpose) - { - matrix.set_size(cols, rows); - info = DatasetMapper(cols); - } - else - { - matrix.set_size(rows, cols); - info = DatasetMapper(rows); - } - - stream.close(); - stream.open(filename, std::fstream::in); - - if(transpose) - { - std::vector> tokensArray; - std::vector tokens; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - tokens = details::ToTokens(lineTok); - if(tokens.size() == cols) - { - tokensArray.emplace_back(std::move(tokens)); - } - } - for(size_t i = 0; i != cols; ++i) - { - details::TransPoseTokens(tokensArray, tokens, i); - details::MapToNumerical(tokens, i, - info, matrix); - } - } - else - { - size_t row = 0; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - details::MapToNumerical(details::ToTokens(lineTok), row, - info, matrix); - ++row; - } - } - } - else if (extension == "arff") - { - Log::Info << "Loading '" << filename << "' as ARFF dataset. " - << std::flush; - try - { - LoadARFF(filename, matrix, info); - - // We transpose by default. So, un-transpose if necessary... - if (!transpose) - inplace_transpose(matrix); - } - catch (std::exception& e) - { - if (fatal) - Log::Fatal << e.what() << std::endl; - else - Log::Warn << e.what() << std::endl; - } - } - else - { - // The type is unknown. - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Unable to detect type of '" << filename << "'; " - << "incorrect extension?" << std::endl; - else - Log::Warn << "Unable to detect type of '" << filename << "'; load failed." - << " Incorrect extension?" << std::endl; - - return false; - } - - Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows) - << " x " << (transpose ? 
matrix.n_rows : matrix.n_cols) << ".\n"; - - Timer::Stop("loading_data"); - - return true; -} - // Load a model from file. template bool Load(const std::string& filename, From a92afaaafb1af3deede31c1a5ef0b508bfbfe105 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 4 Jul 2016 07:07:29 +0900 Subject: [PATCH 22/40] delete load overload --- src/mlpack/core/data/dataset_info.hpp | 10 ++++ src/mlpack/core/data/dataset_info_impl.hpp | 7 +++ src/mlpack/core/data/load.hpp | 46 ------------------- src/mlpack/core/data/load_impl.hpp | 15 +++--- .../data/map_policies/increment_policy.hpp | 3 +- .../core/data/map_policies/missing_policy.hpp | 9 ++-- .../preprocess/preprocess_imputer_main.cpp | 2 +- src/mlpack/tests/imputation_test.cpp | 28 +++++++---- 8 files changed, 52 insertions(+), 68 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 91e150bed3d..da9f3cef479 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -37,7 +37,13 @@ class DatasetMapper */ explicit DatasetMapper(const size_t dimensionality = 0); + /** + * Create the DatasetMapper object with the given policy and dimensionality. + * Note that the dimensionality cannot be changed later; you will have to + * create a new DatasetMapper object. Policy can be modified by the modifier. + */ explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0); + /** * Given the string and the dimension to which it belongs, return its numeric * mapping. If no mapping yet exists, the string is added to the list of @@ -101,8 +107,12 @@ class DatasetMapper ar & data::CreateNVP(maps, "maps"); } + //! Return the policy of the mapper. PolicyType& Policy() const; + //! Modify the policy of the mapper (be careful!). + PolicyType& Policy(); + private: //! Types of each dimension. 
std::vector types; diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index c95fa1aac81..4eed4a99183 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -121,6 +121,13 @@ inline PolicyType& DatasetMapper::Policy() const return this->policy; } +template +inline PolicyType& DatasetMapper::Policy() +{ + return this->policy; +} + + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 8694cc2f045..4b5debe1275 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -96,52 +96,6 @@ bool Load(const std::string& filename, arma::Mat& matrix, DatasetMapper& info, const bool fatal = false, - const bool transpose = true) -{ - PolicyType policy; - return Load(filename, matrix, info, policy, fatal, transpose); -} - -/** - * Loads a matrix from a file, guessing the filetype from the extension and - * mapping categorical features with a DatasetMapper object. This will - * transpose the matrix (unless the transpose parameter is set to false). - * This particular overload of Load() can only load text-based formats, such as - * those given below: - * - * - CSV (csv_ascii), denoted by .csv, or optionally .txt - * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt - * - ASCII (raw_ascii), denoted by .txt - * - * If the file extension is not one of those types, an error will be given. - * This is preferable to Armadillo's default behavior of loading an unknown - * filetype as raw_binary, which can have very confusing effects. - * - * If the parameter 'fatal' is set to true, a std::runtime_error exception will - * be thrown if the matrix does not load successfully. The parameter - * 'transpose' controls whether or not the matrix is transposed after loading. 
- * In most cases, because data is generally stored in a row-major format and - * mlpack requires column-major matrices, this should be left at its default - * value of 'true'. - * - * The DatasetMapper object passed to this function will be re-created, so any - * mappings from previous loads will be lost. policy is passed to the - * constructor of DatasetMapper to create a new instance. - * - * @param filename Name of file to load. - * @param matrix Matrix to load contents of file into. - * @param info DatasetMapper object to populate with mappings and data types. - * @param policy Policy class that decides how the DatasetMapper should map. - * @param fatal If an error should be reported as fatal (default false). - * @param transpose If true, transpose the matrix after loading. - * @return Boolean value indicating success or failure of load. - */ -template -bool Load(const std::string& filename, - arma::Mat& matrix, - DatasetMapper& info, - PolicyType& policy, - const bool fatal = false, const bool transpose = true); /** diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index c44b77f0b0d..419b0902cfd 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -369,18 +369,17 @@ bool Load(const std::string& filename, return success; } -// Load with mappings and policy. +// Load with mappings. Unfortunately we have to implement this ourselves. template bool Load(const std::string& filename, arma::Mat& matrix, DatasetMapper& info, - PolicyType& policy, const bool fatal, const bool transpose) { // Get the extension and load as necessary. Timer::Start("loading_data"); - Log::Debug << "Load with Policy" << std::endl; + // Get the extension. std::string extension = Extension(filename); @@ -412,7 +411,7 @@ bool Load(const std::string& filename, type = "raw ASCII-formatted data"; Log::Info << "Loading '" << filename << "' as " << type << ". 
" - << std::flush; + << std::endl; std::string separators; if (commas) separators = ","; @@ -447,14 +446,12 @@ bool Load(const std::string& filename, if (transpose) { matrix.set_size(cols, rows); - Log::Debug << "initialize datasetmapper with policy" << std::endl; - info = DatasetMapper(policy, cols); + info = DatasetMapper(info.Policy(), cols); } else { matrix.set_size(rows, cols); - Log::Debug << "initialize datasetmapper with policy" << std::endl; - info = DatasetMapper(policy, rows); + info = DatasetMapper(info.Policy(), rows); } stream.close(); @@ -499,7 +496,7 @@ bool Load(const std::string& filename, else if (extension == "arff") { Log::Info << "Loading '" << filename << "' as ARFF dataset. " - << std::flush; + << std::endl; try { LoadARFF(filename, matrix, info); diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index f0b1d7094f1..d4b104b5285 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -24,7 +24,8 @@ namespace data { class IncrementPolicy { public: - typedef size_t mapped_type; + // typedef of mapped_type + using mapped_type = size_t; template mapped_type MapString(MapType& maps, diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 970a0ee797b..6c3d1d11174 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -24,7 +24,8 @@ namespace data { class MissingPolicy { public: - typedef size_t mapped_type; + // typedef of mapped_type + using mapped_type = size_t; MissingPolicy() { @@ -48,9 +49,10 @@ class MissingPolicy // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. 
+ Log::Debug << "missingSet has: " << missingSet.count(string) << std::endl; if (missingSet.count(string) != 0 && - maps.count(dimension) == 0 || - maps[dimension].first.left.count(string) == 0) + (maps.count(dimension) == 0 || + maps[dimension].first.left.count(string) == 0)) { // This string does not exist yet. size_t& numMappings = maps[dimension].second; @@ -62,6 +64,7 @@ class MissingPolicy else { // This string already exists in the mapping. + Log::Debug << "string already exists in the mapping" << std::endl; return maps[dimension].first.left.at(string); } } diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 015ad96cd7a..6857352a37b 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -98,7 +98,7 @@ int main(int argc, char** argv) Log::Debug << "initalize info(policy)" << endl; DatasetMapper info(policy); - Load(inputFile, input, info, policy, true, true); + Load(inputFile, input, info, true, true); // for testing purpose Log::Info << input << endl; diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 6abbe1da697..87469452222 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -39,11 +39,12 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) arma::mat input; arma::mat output; - string missingValue = "a"; - double customValue = 99; - size_t feature = 0; + size_t dimension = 0; - DatasetInfo info; + std::set mset; + mset.insert("a"); + MissingPolicy miss(mset); + DatasetMapper info(miss); BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); BOOST_REQUIRE_EQUAL(input.n_rows, 3); @@ -51,10 +52,21 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/ - //Imputer> impu(info); - //impu.Impute(input, output, missingValue, customValue, 
feature); + Imputer, + CustomImputation> imputer(info); + imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99 + + BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 0), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 1), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); + // Remove the file. remove("test_file.csv"); } From bace8b25ba703878a1348782e9e4feb210062a47 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 4 Jul 2016 09:21:30 +0900 Subject: [PATCH 23/40] modify MapToNumerical to work with MissingPolicy --- src/mlpack/core/data/dataset_info.hpp | 3 ++ src/mlpack/core/data/dataset_info_impl.hpp | 9 +++-- src/mlpack/core/data/load_impl.hpp | 33 +++++-------------- .../core/data/map_policies/missing_policy.hpp | 5 +-- .../preprocess/preprocess_imputer_main.cpp | 5 --- src/mlpack/tests/imputation_test.cpp | 16 +++++---- 6 files changed, 28 insertions(+), 43 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index da9f3cef479..d87e027dd85 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -113,6 +113,9 @@ class DatasetMapper //! Modify the policy of the mapper (be careful!). PolicyType& Policy(); + //! Modify (Replace) the policy of the mapper with a new policy + void Policy(PolicyType& policy); + private: //! Types of each dimension. 
std::vector types; diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 4eed4a99183..1c350974ae8 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -18,7 +18,6 @@ template inline DatasetMapper::DatasetMapper(const size_t dimensionality) : types(dimensionality, Datatype::numeric) { - Log::Debug << "DatasetMapper(dimensionality)" << std::endl; // Nothing to initialize here. } @@ -28,7 +27,6 @@ inline DatasetMapper::DatasetMapper(PolicyType& policy, types(dimensionality, Datatype::numeric), policy(std::move(policy)) { - Log::Debug << "DatasetMapper(policy, dimensionality)" << std::endl; // Nothing to initialize here. } @@ -127,6 +125,13 @@ inline PolicyType& DatasetMapper::Policy() return this->policy; } +template +inline void DatasetMapper::Policy(PolicyType& policy) +{ + this->policy = std::move(policy); +} + + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 419b0902cfd..f521be4ca00 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -65,34 +65,17 @@ void MapToNumerical(const std::vector& tokens, DatasetMapper& info, arma::Mat& matrix) { - auto notNumber = [](const std::string& str) + std::stringstream token; + for (size_t i = 0; i != tokens.size(); ++i) { - eT val(0); - std::stringstream token; - token.str(str); - token>>val; - return token.fail(); - }; - - const bool notNumeric = std::any_of(std::begin(tokens), - std::end(tokens), notNumber); - if(notNumeric) - { - for(size_t i = 0; i != tokens.size(); ++i) + token.str(tokens[i]); + token>>matrix.at(row, i); + if (token.fail()) // if not number, map it to datasetmapper { const eT val = static_cast(info.MapString(tokens[i], row)); matrix.at(row, i) = val; } - } - else - { - std::stringstream token; - for(size_t i = 0; i != tokens.size(); ++i) - { - token.str(tokens[i]); - 
token>>matrix.at(row, i); - token.clear(); - } + token.clear(); } } @@ -411,7 +394,7 @@ bool Load(const std::string& filename, type = "raw ASCII-formatted data"; Log::Info << "Loading '" << filename << "' as " << type << ". " - << std::endl; + << std::flush; std::string separators; if (commas) separators = ","; @@ -496,7 +479,7 @@ bool Load(const std::string& filename, else if (extension == "arff") { Log::Info << "Loading '" << filename << "' as ARFF dataset. " - << std::endl; + << std::flush; try { LoadARFF(filename, matrix, info); diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 6c3d1d11174..c5df0236428 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -29,13 +29,12 @@ class MissingPolicy MissingPolicy() { - Log::Debug << "MissingPolicy()" << std::endl; + // Nothing to initialize here. } explicit MissingPolicy(std::set missingSet) : missingSet(std::move(missingSet)) { - Log::Debug << "MissingPolicy(missingSet)" << std::endl; // Nothing to initialize here. } @@ -49,7 +48,6 @@ class MissingPolicy // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. - Log::Debug << "missingSet has: " << missingSet.count(string) << std::endl; if (missingSet.count(string) != 0 && (maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0)) @@ -64,7 +62,6 @@ class MissingPolicy else { // This string already exists in the mapping. 
- Log::Debug << "string already exists in the mapping" << std::endl; return maps[dimension].first.left.at(string); } } diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 6857352a37b..a0b0a131b9b 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -92,10 +92,8 @@ int main(int argc, char** argv) // Policy tells how the DatasetMapper should map the values. std::set missingSet; missingSet.insert(missingValue); - Log::Debug << "initalize MissingPolicy(missingSet)" << endl; MissingPolicy policy(missingSet); using MapperType = DatasetMapper; - Log::Debug << "initalize info(policy)" << endl; DatasetMapper info(policy); Load(inputFile, input, info, true, true); @@ -149,9 +147,6 @@ int main(int argc, char** argv) } } - // for testing purpose - Log::Info << "output::" << endl; - Log::Info << output << endl; if (!outputFile.empty()) { diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 87469452222..c7d6d4c38e4 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -33,7 +33,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) fstream f; f.open("test_file.csv", fstream::out); f << "a, 2, 3" << endl; - f << "5, 6, 7" << endl; + f << "5, 6, b" << endl; f << "8, 9, 10" << endl; f.close(); @@ -43,6 +43,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) std::set mset; mset.insert("a"); + mset.insert("b"); MissingPolicy miss(mset); DatasetMapper info(miss); BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); @@ -56,15 +57,16 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) DatasetMapper, CustomImputation> imputer(info); imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99 + imputer.Impute(input, output, "b", 99, dimension); // convert b -> 99 BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); - 
BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 7.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 1), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); // Remove the file. From 896a01819631f3a4a0d7ab3fb08189d02435f1c9 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 4 Jul 2016 11:23:27 +0900 Subject: [PATCH 24/40] MissingPolicy uses NaN instead of numbers --- src/mlpack/core/data/dataset_info.hpp | 2 +- src/mlpack/core/data/dataset_info_impl.hpp | 2 +- .../imputation_methods/custom_imputation.hpp | 7 ++++-- .../imputation_methods/listwise_deletion.hpp | 7 ++++-- .../imputation_methods/mean_imputation.hpp | 7 ++++-- .../imputation_methods/median_imputation.hpp | 6 +++-- .../data/map_policies/increment_policy.hpp | 2 +- .../core/data/map_policies/missing_policy.hpp | 14 +++++++----- src/mlpack/tests/imputation_test.cpp | 22 ++++++++++++++----- 9 files changed, 47 insertions(+), 22 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index d87e027dd85..14f2e1c7a91 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -108,7 +108,7 @@ class DatasetMapper } //! Return the policy of the mapper. - PolicyType& Policy() const; + const PolicyType& Policy() const; //! Modify the policy of the mapper (be careful!). 
PolicyType& Policy(); diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 1c350974ae8..0f886885019 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -114,7 +114,7 @@ inline size_t DatasetMapper::Dimensionality() const } template -inline PolicyType& DatasetMapper::Policy() const +inline const PolicyType& DatasetMapper::Policy() const { return this->policy; } diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index 73100e20460..fc95e30c881 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -8,6 +8,7 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP #include +#include using namespace std; @@ -33,7 +34,8 @@ class CustomImputation { for (size_t i = 0; i < input.n_cols; ++i) { - if (input(dimension, i) == mappedValue) + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) { output(dimension, i) = customValue; } @@ -43,7 +45,8 @@ class CustomImputation { for (size_t i = 0; i < input.n_rows; ++i) { - if (input(i, dimension) == mappedValue) + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) { output(i, dimension) = customValue; } diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index f957a85bdbc..19487fa5f1a 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -8,6 +8,7 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP #include +#include using namespace std; @@ -36,7 +37,8 @@ class ListwiseDeletion { for (size_t i = 0; i < input.n_cols; ++i) { - if (input(dimension, i) == mappedValue) + if (input(dimension, i) == 
mappedValue || + std::isnan(input(dimension, i))) { output.shed_col(i - count); count++; @@ -47,7 +49,8 @@ class ListwiseDeletion { for (size_t i = 0; i < input.n_rows; ++i)\ { - if (input(i, dimension) == mappedValue) + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) { output.shed_row(i - count); count++; diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 43f14a045ad..3c3f8530f93 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -8,6 +8,7 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP #include +#include using namespace std; @@ -44,7 +45,8 @@ class MeanImputation { for (size_t i = 0; i < input.n_cols; ++i) { - if (input(dimension, i) == mappedValue) + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) { targets.emplace_back(dimension, i); } @@ -59,7 +61,8 @@ class MeanImputation { for (size_t i = 0; i < input.n_rows; ++i) { - if (input(i, dimension) == mappedValue) + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) { targets.emplace_back(i, dimension); } diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 05eff340d70..c46d326d27c 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -36,7 +36,8 @@ class MedianImputation arma::Mat medianMat = arma::median(input, 1); for (size_t i = 0; i < input.n_cols; ++i) { - if (input(dimension, i) == mappedValue) + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) { output(dimension, i) = medianMat(dimension, 0); } @@ -47,7 +48,8 @@ class MedianImputation arma::Mat medianMat = arma::median(input, 0); for (size_t i = 0; i < input.n_rows; ++i) 
{ - if (input(i, dimension) == mappedValue) + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) { output(i, dimension) = medianMat(0, dimension); } diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index d4b104b5285..3aa09560347 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -46,7 +46,7 @@ class IncrementPolicy if (numMappings == 0) types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; } diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index c5df0236428..b041fe114d0 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -11,6 +11,7 @@ #include #include #include +#include using namespace std; @@ -25,7 +26,7 @@ class MissingPolicy { public: // typedef of mapped_type - using mapped_type = size_t; + using mapped_type = double; MissingPolicy() { @@ -48,21 +49,24 @@ class MissingPolicy // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. + const double NaN = std::numeric_limits::quiet_NaN(); if (missingSet.count(string) != 0 && (maps.count(dimension) == 0 || maps[dimension].first.left.count(string) == 0)) { // This string does not exist yet. 
size_t& numMappings = maps[dimension].second; + numMappings++; - typedef boost::bimap::value_type PairType; - maps[dimension].first.insert(PairType(string, numMappings)); - return numMappings++; + typedef boost::bimap::value_type PairType; + maps[dimension].first.insert(PairType(string, NaN)); + return NaN; } else { // This string already exists in the mapping. - return maps[dimension].first.left.at(string); + //return maps[dimension].first.left.at(string); + return NaN; } } private: diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index c7d6d4c38e4..6f88ca2775d 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -33,13 +33,12 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) fstream f; f.open("test_file.csv", fstream::out); f << "a, 2, 3" << endl; - f << "5, 6, b" << endl; + f << "5, 6, a" << endl; f << "8, 9, 10" << endl; f.close(); arma::mat input; arma::mat output; - size_t dimension = 0; std::set mset; mset.insert("a"); @@ -48,17 +47,28 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) DatasetMapper info(miss); BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); + // row and column test BOOST_REQUIRE_EQUAL(input.n_rows, 3); BOOST_REQUIRE_EQUAL(input.n_cols, 3); - /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/ + // Load check + // MissingPolicy should convert strings to nans + BOOST_REQUIRE(std::isnan(output(0, 0))); + BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); + BOOST_REQUIRE(std::isnan(output(2, 1))); + BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); Imputer, CustomImputation> imputer(info); - imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99 - imputer.Impute(input, output, "b", 
99, dimension); // convert b -> 99 + imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0 + // Custom imputation result check BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5); @@ -66,7 +76,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5); + BOOST_REQUIRE(std::isnan(output(2, 1))); // remains as NaN BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); // Remove the file. From 1a908c2a7b014b825667da28526b30e45dfea084 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Tue, 5 Jul 2016 03:33:52 +0900 Subject: [PATCH 25/40] fix reference issue in DatasetMapper --- src/mlpack/core/data/dataset_info.hpp | 2 +- src/mlpack/core/data/dataset_info_impl.hpp | 4 ++-- .../core/data/map_policies/missing_policy.hpp | 6 +++--- src/mlpack/tests/imputation_test.cpp | 20 +++++++++---------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 14f2e1c7a91..f9aac151cad 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -114,7 +114,7 @@ class DatasetMapper PolicyType& Policy(); //! Modify (Replace) the policy of the mapper with a new policy - void Policy(PolicyType& policy); + void Policy(PolicyType&& policy); private: //! Types of each dimension. 
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 0f886885019..d1bd1cf55d8 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -126,9 +126,9 @@ inline PolicyType& DatasetMapper::Policy() } template -inline void DatasetMapper::Policy(PolicyType& policy) +inline void DatasetMapper::Policy(PolicyType&& policy) { - this->policy = std::move(policy); + this->policy = std::forward(policy); } diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index b041fe114d0..6b1fee972c8 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -56,16 +56,16 @@ class MissingPolicy { // This string does not exist yet. size_t& numMappings = maps[dimension].second; - numMappings++; typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, NaN)); + + ++numMappings; return NaN; } else { - // This string already exists in the mapping. - //return maps[dimension].first.left.at(string); + // This string already exists in the mapping or . 
return NaN; } } diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 6f88ca2775d..9a37939f3a3 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -53,15 +53,15 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) // Load check // MissingPolicy should convert strings to nans - BOOST_REQUIRE(std::isnan(output(0, 0))); - BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); - BOOST_REQUIRE(std::isnan(output(2, 1))); - BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); + BOOST_REQUIRE(std::isnan(input(0, 0)) == true); + BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 3.0, 1e-5); + BOOST_REQUIRE(std::isnan(input(2, 1)) == true); + BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); Imputer, @@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); - BOOST_REQUIRE(std::isnan(output(2, 1))); // remains as NaN + BOOST_REQUIRE(std::isnan(output(2, 1)) == true); // remains as NaN BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); // Remove the file. 
From 2edbc400adac676fa2909724a608adf96a70e024 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 6 Jul 2016 03:52:13 +0900 Subject: [PATCH 26/40] Move MapToNumerical(MapTokens) to Policy class --- src/mlpack/core/data/dataset_info.hpp | 5 +++ src/mlpack/core/data/dataset_info_impl.hpp | 11 +++++ src/mlpack/core/data/load_impl.hpp | 26 +----------- .../data/map_policies/increment_policy.hpp | 40 +++++++++++++++++++ .../core/data/map_policies/missing_policy.hpp | 23 +++++++++++ 5 files changed, 81 insertions(+), 24 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index f9aac151cad..c50454081bf 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -78,6 +78,11 @@ class DatasetMapper typename PolicyType::mapped_type UnmapValue(const std::string& string, const size_t dimension); + template + void MapTokens(const std::vector& tokens, + size_t& row, + arma::Mat& matrix); + //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; //! Modify the type of a given dimension (be careful!). diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index d1bd1cf55d8..015a03afada 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -76,6 +76,17 @@ inline typename PolicyType::mapped_type DatasetMapper::UnmapValue( return maps[dimension].first.left.at(string); } +template +template +inline void DatasetMapper::MapTokens( + const std::vector& tokens, + size_t& row, + arma::Mat& matrix) +{ + return policy.template MapTokens(tokens, row, matrix, maps, + types); +} + // Get the type of a particular dimension. 
template inline Datatype DatasetMapper::Type(const size_t dimension) const diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index f521be4ca00..45266b52644 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -59,26 +59,6 @@ void TransPoseTokens(std::vector> const &input, } } -template -void MapToNumerical(const std::vector& tokens, - size_t& row, - DatasetMapper& info, - arma::Mat& matrix) -{ - std::stringstream token; - for (size_t i = 0; i != tokens.size(); ++i) - { - token.str(tokens[i]); - token>>matrix.at(row, i); - if (token.fail()) // if not number, map it to datasetmapper - { - const eT val = static_cast(info.MapString(tokens[i], row)); - matrix.at(row, i) = val; - } - token.clear(); - } -} - } template @@ -458,8 +438,7 @@ bool Load(const std::string& filename, for(size_t i = 0; i != cols; ++i) { details::TransPoseTokens(tokensArray, tokens, i); - details::MapToNumerical(tokens, i, - info, matrix); + info.MapTokens(tokens, i, matrix); } } else @@ -470,8 +449,7 @@ bool Load(const std::string& filename, // Extract line by line. 
std::getline(stream, buffer, '\n'); Tokenizer lineTok(buffer, sep); - details::MapToNumerical(details::ToTokens(lineTok), row, - info, matrix); + info.MapTokens(details::ToTokens(lineTok), row, matrix); ++row; } } diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index 3aa09560347..68a474fc7a3 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -56,6 +56,46 @@ class IncrementPolicy return maps[dimension].first.left.at(string); } } + + template + void MapTokens(const std::vector& tokens, + size_t& row, + arma::Mat& matrix, + MapType& maps, + std::vector& types) + { + auto notNumber = [](const std::string& str) + { + eT val(0); + std::stringstream token; + token.str(str); + token >> val; + return token.fail(); + }; + + const bool notNumeric = std::any_of(std::begin(tokens), + std::end(tokens), notNumber); + if (notNumeric) + { + for (size_t i = 0; i != tokens.size(); ++i) + { + const eT val = static_cast(this->MapString(maps, types, tokens[i], + row)); + double temp = (double) val; + matrix.at(row, i) = val; + } + } + else + { + std::stringstream token; + for (size_t i = 0; i != tokens.size(); ++i) + { + token.str(tokens[i]); + token >> matrix.at(row, i); + token.clear(); + } + } + } }; // class IncrementPolicy } // namespace data diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 6b1fee972c8..59be01087d8 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -69,6 +69,29 @@ class MissingPolicy return NaN; } } + + template + void MapTokens(const std::vector& tokens, + size_t& row, + arma::Mat& matrix, + MapType& maps, + std::vector& types) + { + std::stringstream token; + for (size_t i = 0; i != tokens.size(); ++i) + { + token.str(tokens[i]); + token>>matrix.at(row, 
i); + if (token.fail()) // if not number, map it to datasetmapper + { + const eT val = static_cast(this->MapString(maps, types, tokens[i], + row)); + matrix.at(row, i) = val; + } + token.clear(); + } + } + private: std::set missingSet; }; // class MissingPolicy From d881cb7e229e9ed5c56922aec3c45d5ddc5518e0 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 6 Jul 2016 04:10:34 +0900 Subject: [PATCH 27/40] make policy and imputation api more consistent --- src/mlpack/core/data/dataset_info_impl.hpp | 2 +- .../imputation_methods/custom_imputation.hpp | 12 ++++++------ .../imputation_methods/listwise_deletion.hpp | 10 +++++----- .../data/imputation_methods/mean_imputation.hpp | 2 +- .../imputation_methods/median_imputation.hpp | 2 +- src/mlpack/core/data/imputer.hpp | 10 +++------- .../core/data/map_policies/increment_policy.hpp | 12 ++++++------ .../core/data/map_policies/missing_policy.hpp | 12 ++++++------ src/mlpack/tests/imputation_test.cpp | 16 ++++++++-------- 9 files changed, 37 insertions(+), 41 deletions(-) diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index 015a03afada..6bb7d759bb4 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -37,7 +37,7 @@ inline typename PolicyType::mapped_type DatasetMapper::MapString( const std::string& string, const size_t dimension) { - return policy.template MapString(maps, types, string, dimension); + return policy.template MapString(string, dimension, maps, types); } // Return the string corresponding to a value in a given dimension. 
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index fc95e30c881..83bde9df5f9 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -19,12 +19,12 @@ template class CustomImputation { public: - void Apply(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const T& customValue, - const size_t dimension, - const bool transpose = true) + void Impute(const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const T& customValue, + const size_t dimension, + const bool transpose = true) { // initiate output output = input; diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index 19487fa5f1a..9f3a7d2e51f 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -23,11 +23,11 @@ template class ListwiseDeletion { public: - void Apply(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const size_t dimension, - const bool transpose = true) + void Impute(const arma::Mat& input, + arma::Mat& output, + const T& mappedValue, + const size_t dimension, + const bool transpose = true) { // initiate output output = input; diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 3c3f8530f93..157ba530615 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -22,7 +22,7 @@ template class MeanImputation { public: - void Apply (const arma::Mat& input, + void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, const size_t dimension, diff --git 
a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index c46d326d27c..0035be91a3c 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -22,7 +22,7 @@ template class MedianImputation { public: - void Apply (const arma::Mat& input, + void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, const size_t dimension, diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 45966392d63..3bc9cb591fb 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -60,7 +60,7 @@ class Imputer const size_t dimension) { T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Apply(input, output, mappedValue, dimension, transpose); + strategy.Impute(input, output, mappedValue, dimension, transpose); } /** @@ -74,12 +74,8 @@ class Imputer const size_t dimension) { T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Apply(input, - output, - mappedValue, - customValue, - dimension, - transpose); + strategy.Impute(input, output, mappedValue, customValue, dimension, + transpose); } //! 
Get the strategy diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index 68a474fc7a3..4a971d21f83 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -28,10 +28,10 @@ class IncrementPolicy using mapped_type = size_t; template - mapped_type MapString(MapType& maps, - std::vector& types, - const std::string& string, - const size_t dimension) + mapped_type MapString(const std::string& string, + const size_t dimension, + MapType& maps, + std::vector& types) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, @@ -79,8 +79,8 @@ class IncrementPolicy { for (size_t i = 0; i != tokens.size(); ++i) { - const eT val = static_cast(this->MapString(maps, types, tokens[i], - row)); + const eT val = static_cast(this->MapString(tokens[i], row, maps, + types)); double temp = (double) val; matrix.at(row, i) = val; } diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 59be01087d8..88fbb39dd5a 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -41,10 +41,10 @@ class MissingPolicy template - mapped_type MapString(MapType& maps, - std::vector& types, - const std::string& string, - const size_t dimension) + mapped_type MapString(const std::string& string, + const size_t dimension, + MapType maps, + std::vector& types) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. 
In either case, @@ -84,8 +84,8 @@ class MissingPolicy token>>matrix.at(row, i); if (token.fail()) // if not number, map it to datasetmapper { - const eT val = static_cast(this->MapString(maps, types, tokens[i], - row)); + const eT val = static_cast(this->MapString(tokens[i], row, maps, + types)); matrix.at(row, i) = val; } token.clear(); diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 9a37939f3a3..f02e97e0018 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -99,7 +99,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) CustomImputation imputer; // transposed - imputer.Apply(input, outputT, mappedValue, customValue, 0/*dimension*/, true); + imputer.Impute(input, outputT, mappedValue, customValue, 0/*dimension*/, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5); @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, customValue, 1, false); + imputer.Impute(input, output, mappedValue, customValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5); @@ -146,7 +146,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) MeanImputation imputer; // transposed - imputer.Apply(input, outputT, mappedValue, 0, true); + imputer.Impute(input, outputT, mappedValue, 0, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.5, 1e-5); @@ -162,7 +162,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, 1, false); + imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); @@ -193,7 +193,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) MedianImputation imputer; // 
transposed - imputer.Apply(input, outputT, mappedValue, 1, true); + imputer.Impute(input, outputT, mappedValue, 1, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 0.0, 1e-5); @@ -209,7 +209,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, 1, false); + imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); @@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) ListwiseDeletion imputer; // transposed - imputer.Apply(input, outputT, mappedValue, 0, true); // transposed + imputer.Impute(input, outputT, mappedValue, 0, true); // transposed BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.0, 1e-5); @@ -250,7 +250,7 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) BOOST_REQUIRE_CLOSE(outputT(2, 1), 4.0, 1e-5); // not transposed - imputer.Apply(input, output, mappedValue, 1, false); // not transposed + imputer.Impute(input, output, mappedValue, 1, false); // not transposed BOOST_REQUIRE_CLOSE(output(0, 0), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); From a8818316a04506530e2269a2e0a32ba2f6a1c83b Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Wed, 6 Jul 2016 21:29:34 +0900 Subject: [PATCH 28/40] numerical values can be set as missing values --- src/mlpack/core/data/map_policies/missing_policy.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index 88fbb39dd5a..c68fc27d148 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -80,6 +80,12 @@ class MissingPolicy std::stringstream token; for (size_t i = 0; i != tokens.size(); ++i) { + if (missingSet.find(tokens[i]) != std::end(missingSet)) + { + const 
eT val = static_cast(this->MapString(tokens[i], row, maps, + types)); + matrix.at(row, i) = val; + } token.str(tokens[i]); token>>matrix.at(row, i); if (token.fail()) // if not number, map it to datasetmapper From 63268a3f1cc1ace8143ae0e6f9e8d9aa81822fd2 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Fri, 8 Jul 2016 03:52:03 +0900 Subject: [PATCH 29/40] add comments and use more proper names --- src/mlpack/core/data/CMakeLists.txt | 4 +- .../{dataset_info.hpp => dataset_mapper.hpp} | 15 +-- ..._info_impl.hpp => dataset_mapper_impl.hpp} | 29 +++--- .../imputation_methods/custom_imputation.hpp | 20 +++- .../imputation_methods/listwise_deletion.hpp | 19 ++-- .../imputation_methods/mean_imputation.hpp | 18 +++- .../imputation_methods/median_imputation.hpp | 19 +++- src/mlpack/core/data/imputer.hpp | 29 +++--- src/mlpack/core/data/load.hpp | 2 +- .../core/data/map_policies/datatype.hpp | 2 - .../data/map_policies/increment_policy.hpp | 53 ++++++++--- .../core/data/map_policies/missing_policy.hpp | 77 +++++++++++---- .../preprocess/preprocess_imputer_main.cpp | 94 +++++++++++-------- src/mlpack/tests/imputation_test.cpp | 2 +- 14 files changed, 258 insertions(+), 125 deletions(-) rename src/mlpack/core/data/{dataset_info.hpp => dataset_mapper.hpp} (94%) rename src/mlpack/core/data/{dataset_info_impl.hpp => dataset_mapper_impl.hpp} (78%) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 65e4fc3e8b3..2fbd5d32e62 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -1,8 +1,8 @@ # Define the files that we need to compile. # Anything not in this list will not be compiled into mlpack. 
set(SOURCES - dataset_info.hpp - dataset_info_impl.hpp + dataset_mapper.hpp + dataset_mapper_impl.hpp extension.hpp format.hpp load.hpp diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_mapper.hpp similarity index 94% rename from src/mlpack/core/data/dataset_info.hpp rename to src/mlpack/core/data/dataset_mapper.hpp index c50454081bf..ab9340c9818 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_mapper.hpp @@ -1,6 +1,7 @@ /** - * @file dataset_info.hpp + * @file dataset_mapper.hpp * @author Ryan Curtin + * @author Keon Kim * * Defines the DatasetMapper class, which holds information about a dataset. * This is useful when the dataset contains categorical non-numeric features @@ -53,7 +54,7 @@ class DatasetMapper * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. */ - typename PolicyType::mapped_type MapString(const std::string& string, + typename PolicyType::MappedType MapString(const std::string& string, const size_t dimension); /** @@ -75,13 +76,13 @@ class DatasetMapper * @param string Mapped string for value. * @param dimension Dimension to unmap string from. */ - typename PolicyType::mapped_type UnmapValue(const std::string& string, + typename PolicyType::MappedType UnmapValue(const std::string& string, const size_t dimension); template void MapTokens(const std::vector& tokens, - size_t& row, - arma::Mat& matrix); + size_t& row, + arma::Mat& matrix); //! Return the type of a given dimension (numeric or categorical). Datatype Type(const size_t dimension) const; @@ -126,7 +127,7 @@ class DatasetMapper std::vector types; // BiMapType definition - using BiMapType = boost::bimap; + using BiMapType = boost::bimap; // Mappings from strings to integers. // Map entries will only exist for dimensions that are categorical. 
@@ -143,6 +144,6 @@ using DatasetInfo = DatasetMapper; } // namespace data } // namespace mlpack -#include "dataset_info_impl.hpp" +#include "dataset_mapper_impl.hpp" #endif diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp similarity index 78% rename from src/mlpack/core/data/dataset_info_impl.hpp rename to src/mlpack/core/data/dataset_mapper_impl.hpp index 6bb7d759bb4..6b291e2d368 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_mapper_impl.hpp @@ -1,6 +1,7 @@ /** - * @file dataset_info_impl.hpp + * @file dataset_mapper_impl.hpp * @author Ryan Curtin + * @author Keon Kim * * An implementation of the DatasetMapper class. */ @@ -8,7 +9,7 @@ #define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP // In case it hasn't already been included. -#include "dataset_info.hpp" +#include "dataset_mapper.hpp" namespace mlpack { namespace data { @@ -23,7 +24,7 @@ inline DatasetMapper::DatasetMapper(const size_t dimensionality) : template inline DatasetMapper::DatasetMapper(PolicyType& policy, - const size_t dimensionality) : + const size_t dimensionality) : types(dimensionality, Datatype::numeric), policy(std::move(policy)) { @@ -33,9 +34,9 @@ inline DatasetMapper::DatasetMapper(PolicyType& policy, // When we want to insert value into the map, // we could use the policy to map the string template -inline typename PolicyType::mapped_type DatasetMapper::MapString( - const std::string& string, - const size_t dimension) +inline typename PolicyType::MappedType DatasetMapper::MapString( + const std::string& string, + const size_t dimension) { return policy.template MapString(string, dimension, maps, types); } @@ -43,8 +44,8 @@ inline typename PolicyType::mapped_type DatasetMapper::MapString( // Return the string corresponding to a value in a given dimension. 
template inline const std::string& DatasetMapper::UnmapString( - const size_t value, - const size_t dimension) + const size_t value, + const size_t dimension) { // Throw an exception if the value doesn't exist. if (maps[dimension].first.right.count(value) == 0) @@ -60,9 +61,9 @@ inline const std::string& DatasetMapper::UnmapString( // Return the value corresponding to a string in a given dimension. template -inline typename PolicyType::mapped_type DatasetMapper::UnmapValue( - const std::string& string, - const size_t dimension) +inline typename PolicyType::MappedType DatasetMapper::UnmapValue( + const std::string& string, + const size_t dimension) { // Throw an exception if the value doesn't exist. if (maps[dimension].first.left.count(string) == 0) @@ -79,9 +80,9 @@ inline typename PolicyType::mapped_type DatasetMapper::UnmapValue( template template inline void DatasetMapper::MapTokens( - const std::vector& tokens, - size_t& row, - arma::Mat& matrix) + const std::vector& tokens, + size_t& row, + arma::Mat& matrix) { return policy.template MapTokens(tokens, row, matrix, maps, types); diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index 83bde9df5f9..1698ba94eba 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -8,17 +8,29 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP #include -#include - -using namespace std; namespace mlpack { namespace data { - +/** + * A simple custom imputation class + * @tparam T Type of armadillo matrix + */ template class CustomImputation { public: + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the user-defined custom value of the given dimension. + * The result is saved to the output. + * + * @param input Matrix that contains mappedValue. 
+ * @param output Matrix that the result will be saved into. + * @param mappedValue Value that the user wants to get rid of. + * @param customValue Value that the user wants to replace mappedValue with. + * @param dimension Index of the dimension of the mappedValue. + * @param transpose State of whether the input matrix is transposed or not. + */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index 9f3a7d2e51f..06db83aba09 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -8,21 +8,28 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP #include -#include - -using namespace std; namespace mlpack { namespace data { - /** - * complete-case analysis. + * A complete-case analysis to remove the values containing mappedValue. * Removes all data for a case that has one or more missing values. + * @tparam T Type of armadillo matrix */ template class ListwiseDeletion { public: + /** + * Impute function searches through the input looking for mappedValue and + * remove the whole row or column. The result is saved to the output. + * + * @param input Matrix that contains mappedValue. + * @param output Matrix that the result will be saved into. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param transpose State of whether the input matrix is transposed or not. 
+ */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, @@ -47,7 +54,7 @@ class ListwiseDeletion } else { - for (size_t i = 0; i < input.n_rows; ++i)\ + for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue || std::isnan(input(i, dimension))) diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 157ba530615..05134e5c552 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -8,20 +8,28 @@ #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP #include -#include - -using namespace std; namespace mlpack { namespace data { - /** - * A simple mean imputation + * A simple mean imputation class + * @tparam T Type of armadillo matrix */ template class MeanImputation { public: + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the mean of the given dimension. The result is saved + * to the output. + * + * @param input Matrix that contains mappedValue. + * @param output Matrix that the result will be saved into. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param transpose State of whether the input matrix is transposed or not. + */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 0035be91a3c..8a111d4ee29 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -9,19 +9,28 @@ #include -using namespace std; - namespace mlpack { namespace data { - /** - * A simple median imputation + * This is a class implementation of simple median imputation. 
* replace missing value with middle or average of middle values + * @tparam T Type of armadillo matrix */ template class MedianImputation { public: + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the median of the given dimension. The result is saved + * to the output. + * + * @param input Matrix that contains mappedValue. + * @param output Matrix that the result will be saved into. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param transpose State of whether the input matrix is transposed or not. + */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, @@ -56,7 +65,7 @@ class MedianImputation } } } -}; // class MeanImputation +}; // class MedianImputation } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 3bc9cb591fb..b719ba29cba 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -9,7 +9,9 @@ #define MLPACK_CORE_DATA_IMPUTER_HPP #include -#include "dataset_info.hpp" +#include "dataset_mapper.hpp" +#include "map_policies/missing_policy.hpp" +#include "map_policies/increment_policy.hpp" namespace mlpack { namespace data { @@ -27,22 +29,18 @@ class Imputer { public: Imputer(MapperType mapper, bool transpose = true): - mapper(std::move(mapper)), - transpose(transpose) + mapper(std::move(mapper)), + transpose(transpose) { - //static_assert(std::is_same::type, - //data::IncrementPolicy>::value, "The type of MapperType must be " - //"IncrementPolicy"); + // Nothing to initialize here. 
} Imputer(MapperType mapper, StrategyType strategy, bool transpose = true): - strategy(std::move(strategy)), - mapper(std::move(mapper)), - transpose(transpose) + strategy(std::move(strategy)), + mapper(std::move(mapper)), + transpose(transpose) { - //static_assert(std::is_same::type, - //data::IncrementPolicy>::value, "The type of MapperType must be " - //"IncrementPolicy"); + // Nothing to initialize here. } /** @@ -66,6 +64,13 @@ class Imputer /** * This overload of Impute() lets users to define custom value that can be * replaced with the target value. + * + * @param input Input dataset to apply imputation. + * @param output Armadillo matrix to save the results + * @oaran missingValue User defined missing value; it can be anything. + * @param customValue The numeric value that a user wants to replace + * missingValue with. + * @param dimension Dimension to apply the imputation. */ void Impute(const arma::Mat& input, arma::Mat& output, diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 4b5debe1275..40d3834e3fe 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -14,7 +14,7 @@ #include #include "format.hpp" -#include "dataset_info.hpp" +#include "dataset_mapper.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { diff --git a/src/mlpack/core/data/map_policies/datatype.hpp b/src/mlpack/core/data/map_policies/datatype.hpp index 0cafba8e672..3a3b1ac137e 100644 --- a/src/mlpack/core/data/map_policies/datatype.hpp +++ b/src/mlpack/core/data/map_policies/datatype.hpp @@ -10,7 +10,6 @@ namespace mlpack { namespace data { - /** * The Datatype enum specifies the types of data mlpack algorithms can use. * The vast majority of mlpack algorithms can only use numeric data (i.e. 
@@ -23,7 +22,6 @@ enum Datatype : bool /* [> bool is all the precision we need for two types <] */ categorical = 1 }; - } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp index 4a971d21f83..4ff7341a653 100644 --- a/src/mlpack/core/data/map_policies/increment_policy.hpp +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -12,26 +12,39 @@ #include #include -using namespace std; - namespace mlpack { namespace data { - /** - * This class is used to map strings to incrementing unsigned integers (size_t). - * First string to be mapped will be mapped to 0, next to 1 and so on. + * IncrementPolicy is used as a helper class for DatasetMapper. It tells how the + * strings should be mapped. Purpose of this policy is to map all dimension if + * one of the variables in a dimension turns out to be a categorical variable. + * IncrementPolicy maps strings to incrementing unsigned integers (size_t). + * The first string to be mapped will be mapped to 0, the next to 1 and so on. */ class IncrementPolicy { public: - // typedef of mapped_type - using mapped_type = size_t; + // typedef of MappedType + using MappedType = size_t; + /** + * Given the string and the dimension to which it belongs, and the maps + * and types given by the DatasetMapper class, returns its numeric mapping. + * If no mapping yet exists, the string is added to the list of mappings for + * the given dimension. This function is used as a helper function for + * DatasetMapper class. + * + * @tparam MapType Type of unordered_map that contains mapped value pairs + * @param string String to find/create mapping for. + * @param dimension Index of the dimension of the string. + * @param maps Unordered map given by the DatasetMapper. + * @param types Vector containing the type information about each dimension.
+ */ template - mapped_type MapString(const std::string& string, - const size_t dimension, - MapType& maps, - std::vector& types) + MappedType MapString(const std::string& string, + const size_t dimension, + MapType& maps, + std::vector& types) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, @@ -46,7 +59,7 @@ class IncrementPolicy if (numMappings == 0) types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, numMappings)); return numMappings++; } @@ -57,6 +70,21 @@ class IncrementPolicy } } + /** + * MapTokens turns vector of strings into numeric variables and puts them + * into a given matrix. It is used as a helper function when trying to load + * files. Each dimension's tokens are given in to this function. If one of the + * tokens turns out to be a string, all the tokens should be mapped using the + * MapString() function. + * + * @tparam eT Type of armadillo matrix. + * @tparam MapType Type of unordered_map that contains mapped value pairs. + * @param tokens Vector of variables inside a dimension. + * @param row Position of the given tokens. + * @param matrix Matrix to save the data into. + * @param maps Maps given by the DatasetMapper class. + * @param types Types of each dimension given by the DatasetMapper class.
+ */ template void MapTokens(const std::vector& tokens, size_t& row, @@ -81,7 +109,6 @@ class IncrementPolicy { const eT val = static_cast(this->MapString(tokens[i], row, maps, types)); - double temp = (double) val; matrix.at(row, i) = val; } } diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index c68fc27d148..ead543ad6ae 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -13,38 +13,58 @@ #include #include - -using namespace std; - namespace mlpack { namespace data { - /** - * Same as increment map policy, but does not change type of features. + * MissingPolicy is used as a helper class for DatasetMapper. It tells how the + * strings should be mapped. Purpose of this policy is to map all user-defined + * missing variables into maps so that users can decide what to do with the + * corrupted data. User-defined missing variables are given by the missingSet. + * Note that MissingPolicy does not change type of features. */ class MissingPolicy { public: - // typedef of mapped_type - using mapped_type = double; + // typedef of MappedType + using MappedType = double; MissingPolicy() { // Nothing to initialize here. } + /** + * Create the MissingPolicy object with the given missingSet. Note that the + * missingSet cannot be changed later; you will have to create a new + * MissingPolicy object. + * + * @param missingSet Set of strings that should be mapped. + */ explicit MissingPolicy(std::set missingSet) : - missingSet(std::move(missingSet)) + missingSet(std::move(missingSet)) { // Nothing to initialize here. } - + /** + * Given the string and the dimension to which it belongs by the user, and + * the maps and types given by the DatasetMapper class, returns its numeric + * mapping. If no mapping yet exists and the string is included in the + * missingSet, the string is added to the list of mappings for the given + * dimension. 
This function is used as a helper function for DatasetMapper + * class. + * + * @tparam MapType Type of unordered_map that contains mapped value pairs + * @param string String to find/create mapping for. + * @param dimension Index of the dimension of the string. + * @param maps Unordered map given by the DatasetMapper. + * @param types Vector containing the type information about each dimensions. + */ template - mapped_type MapString(const std::string& string, - const size_t dimension, - MapType maps, - std::vector& types) + MappedType MapString(const std::string& string, + const size_t dimension, + MapType maps, + std::vector& types) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, @@ -57,7 +77,7 @@ class MissingPolicy // This string does not exist yet. size_t& numMappings = maps[dimension].second; - typedef boost::bimap::value_type PairType; + typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, NaN)); ++numMappings; @@ -65,11 +85,28 @@ class MissingPolicy } else { - // This string already exists in the mapping or . + // This string already exists in the mapping + // or not included in missingSet. return NaN; } } + /** + * MapTokens turns vector of strings into numeric variables and puts them + * into a given matrix. It is used as a helper function when trying to load + * files. Each dimension's tokens are given in to this function. If one of the + * tokens turns out to be a string or one of the missingSet's variables, only + * the token responsible for it should be mapped using the MapString() + * function. + * + * @tparam eT Type of armadillo matrix. + * @tparam MapType Type of unordered_map that contains mapped value pairs. + * @param tokens Vector of variables inside a dimension. + * @param row Position of the given tokens. + * @param matrix Matrix to save the data into.
+ * @param maps Maps given by the DatasetMapper class. + * @param types Types of each dimensions given by the DatasetMapper class. + */ template void MapTokens(const std::vector& tokens, size_t& row, @@ -77,9 +114,14 @@ class MissingPolicy MapType& maps, std::vector& types) { + // MissingPolicy allows double type matrix only, because it uses NaN. + static_assert(std::is_same::value, "You must use double type " + " matrix in order to apply MissingPolicy"); + std::stringstream token; for (size_t i = 0; i != tokens.size(); ++i) { + // if token is a number, but is included in the missingSet, map it. if (missingSet.find(tokens[i]) != std::end(missingSet)) { const eT val = static_cast(this->MapString(tokens[i], row, maps, @@ -88,7 +130,8 @@ class MissingPolicy } token.str(tokens[i]); token>>matrix.at(row, i); - if (token.fail()) // if not number, map it to datasetmapper + // if the token is not number, map it. + if (token.fail()) { const eT val = static_cast(this->MapString(tokens[i], row, maps, types)); @@ -99,6 +142,8 @@ class MissingPolicy } private: + // Note that missingSet and maps are different. + // missingSet specifies which value/string should be mapped. std::set missingSet; }; // class MissingPolicy diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index a0b0a131b9b..603cdcc233a 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include #include @@ -19,8 +19,8 @@ PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user " "defined missing variable to another to provide more meaningful analysis " "\n\n" "The program does not modify the original file, but instead makes a " - "separate file to save the output data; The program requires you to " - "specify the file name with --output_file (-o)." 
+ "separate file to save the output data; You can save the output by " + "specifying the file name with --output_file (-o)." "\n\n" "For example, if we consider 'NULL' in dimension 0 to be a missing " "variable and want to delete whole row containing the NULL in the " @@ -43,7 +43,6 @@ using namespace data; int main(int argc, char** argv) { - // Parse command line options. CLI::ParseCommandLine(argc, argv); const string inputFile = CLI::GetParam("input_file"); @@ -53,21 +52,25 @@ int main(int argc, char** argv) const size_t dimension = (size_t) CLI::GetParam("dimension"); string strategy = CLI::GetParam("strategy"); - // missing value should be specified + // The program needs user-defined missing values. + // Missing values can be any list of strings such as "1", "a", "NULL". if (!CLI::HasParam("missing_value")) Log::Fatal << "--missing_value must be specified in order to perform " << "any imputation strategies." << endl; - // warn if user did not specify output_file + if (!CLI::HasParam("strategy")) + Log::Fatal << "--strategy must be specified in order to perform " + << "imputation."<< endl; + if (!CLI::HasParam("output_file")) Log::Warn << "--output_file is not specified, no " << "results from this program will be saved!" << endl; - // warn if user did not specify dimension if (!CLI::HasParam("dimension")) - Log::Warn << "--dimension is required to be specified!" << endl; + Log::Warn << "--dimension is not specified, the imputation will be " + << "applied to all dimensions."<< endl; - // if custom value is specified, and imputation strategy is not, + // If custom value is specified, and imputation strategy is not, // set imputation strategy to "custom" if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) { @@ -76,7 +79,7 @@ int main(int argc, char** argv) << "--impute_strategy is automatically set to 'custom'." 
<< endl; } - // custom value and any other impute strategies cannot be specified at + // Custom value and any other impute strategies cannot be specified at // the same time. if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") && strategy != "custom") @@ -89,6 +92,7 @@ int main(int argc, char** argv) << "'custom' strategy" << endl; arma::mat input; + arma::mat output; // Policy tells how the DatasetMapper should map the values. std::set missingSet; missingSet.insert(missingValue); @@ -98,9 +102,6 @@ int main(int argc, char** argv) Load(inputFile, input, info, true, true); - // for testing purpose - Log::Info << input << endl; - // print how many mapping exist in each dimensions for (size_t i = 0; i < input.n_rows; ++i) { @@ -108,49 +109,68 @@ int main(int argc, char** argv) << endl; } - arma::Mat output(input); - - Log::Info << "Performing '" << strategy << "' imputation strategy " - << "on dimension '" << dimension << "'." << endl; - - // custom strategy only - if (strategy == "custom") + // default imputer is mean imputation (to provide scope) + Imputer> impu(info); + if (strategy == "median") + { + Imputer> impu(info); + } + else if (strategy == "listwise_deletion") { - Log::Info << "Replacing all '" << missingValue << "' with '" << customValue - << "'." << endl; - Imputer> impu(info); - impu.Impute(input, output, missingValue, customValue, dimension); + Imputer> impu(info); } else { - Log::Info << "Replacing all '" << missingValue << "' with '" - << strategy << "' strategy." << endl; + Log::Fatal << "'" << strategy << "' imputation strategy does not exist" + << endl; + } - if (strategy == "mean") + // Initialize imputer class + + if (CLI::HasParam("dimension")) + { + // when --dimension is specified, + // the program will apply the changes to only the given dimension. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on all dimensions." 
<< endl; + + if (strategy == "custom") { - Imputer> impu(info); - impu.Impute(input, output, missingValue, dimension); + Imputer> impu(info); + impu.Impute(input, output, missingValue, customValue, dimension); } - else if (strategy == "median") + else { - Imputer> impu(info); impu.Impute(input, output, missingValue, dimension); } - else if (strategy == "listwise_deletion") + } + else + { + // when --dimension is not specified, + // the program will apply the changes to all dimensions. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on all dimensions." << endl; + + if (strategy == "custom") { - Imputer> impu(info); - impu.Impute(input, output, missingValue, dimension); + Imputer> impu(info); + for (size_t i = 0; i < input.n_rows; ++i) + { + impu.Impute(input, output, missingValue, customValue, i); + } } else { - Log::Warn << "You did not choose any imputation strategy" << endl; + for (size_t i = 0; i < input.n_rows; ++i) + { + impu.Impute(input, output, missingValue, i); + } } } - if (!outputFile.empty()) { - Log::Info << "Saving model to '" << outputFile << "'." << endl; + Log::Info << "Saving results to '" << outputFile << "'." 
<< endl; Save(outputFile, output, false); } } diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index f02e97e0018..e118bfb90d9 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include From 2eb675467528574b65fa23d78e4e8d3e6e6ea6c5 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 11 Jul 2016 06:26:48 +0900 Subject: [PATCH 30/40] modify custom impute interface and rename variables --- src/mlpack/core/data/dataset_mapper.hpp | 1 + .../imputation_methods/custom_imputation.hpp | 20 +++++-- .../imputation_methods/listwise_deletion.hpp | 6 +- .../imputation_methods/mean_imputation.hpp | 6 +- .../imputation_methods/median_imputation.hpp | 6 +- src/mlpack/core/data/imputer.hpp | 36 +++-------- .../core/data/map_policies/missing_policy.hpp | 20 +++---- .../preprocess/preprocess_imputer_main.cpp | 59 ++++++++----------- src/mlpack/tests/data/impute_test.csv | 3 + src/mlpack/tests/imputation_test.cpp | 16 ++--- 10 files changed, 76 insertions(+), 97 deletions(-) create mode 100644 src/mlpack/tests/data/impute_test.csv diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp index ab9340c9818..0001438eb5d 100644 --- a/src/mlpack/core/data/dataset_mapper.hpp +++ b/src/mlpack/core/data/dataset_mapper.hpp @@ -131,6 +131,7 @@ class DatasetMapper // Mappings from strings to integers. // Map entries will only exist for dimensions that are categorical. 
+ // MapType = map, numMappings>> using MapType = std::unordered_map>; MapType maps; diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index 1698ba94eba..a34658b7ee8 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -19,30 +19,35 @@ template class CustomImputation { public: + CustomImputation(T customValue): + customValue(std::move(customValue)) + { + // nothing to initialize here + } + /** * Impute function searches through the input looking for mappedValue and * replaces it with the user-defined custom value of the given dimension. - * The result is saved to the output. + * The result is saved to the output. Custom value must be set when + * initializing the CustomImputation object. * * @param input Matrix that contains mappedValue. * @param output Matrix that the result will be saved into. * @param mappedValue Value that the user wants to get rid of. - * @param customValue Value that the user wants to replace mappedValue with. * @param dimension Index of the dimension of the mappedValue. - * @param transpose State of whether the input matrix is transposed or not. + * @param columnMajor State of whether the input matrix is columnMajor or not.
*/ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, - const T& customValue, const size_t dimension, - const bool transpose = true) + const bool columnMajor = true) { // initiate output output = input; // replace the target value to custom value - if (transpose) + if (columnMajor) { for (size_t i = 0; i < input.n_cols; ++i) { @@ -65,6 +70,9 @@ class CustomImputation } } } + + private: + T customValue; }; // class CustomImputation } // namespace data diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index 06db83aba09..9a695a69845 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -28,19 +28,19 @@ class ListwiseDeletion * @param output Matrix that the result will be saved into. * @param mappedValue Value that the user wants to get rid of. * @param dimension Index of the dimension of the mappedValue. - * @param transpose State of whether the input matrix is transposed or not. + * @param columnMajor State of whether the input matrix is columnMajor or not. */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, const size_t dimension, - const bool transpose = true) + const bool columnMajor = true) { // initiate output output = input; size_t count = 0; - if (transpose) + if (columnMajor) { for (size_t i = 0; i < input.n_cols; ++i) { diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 05134e5c552..c4085c61c89 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -28,13 +28,13 @@ class MeanImputation * @param output Matrix that the result will be saved into. * @param mappedValue Value that the user wants to get rid of. 
* @param dimension Index of the dimension of the mappedValue. - * @param transpose State of whether the input matrix is transposed or not. + * @param columnMajor State of whether the input matrix is columnMajor or not. */ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, const size_t dimension, - const bool transpose = true) + const bool columnMajor = true) { // initiate output output = input; @@ -49,7 +49,7 @@ class MeanImputation // calculate number of elements and sum of them excluding mapped value or // nan. while doing that, remember where mappedValue or NaN exists. - if (transpose) + if (columnMajor) { for (size_t i = 0; i < input.n_cols; ++i) { diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 8a111d4ee29..00223668649 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -29,18 +29,18 @@ class MedianImputation * @param output Matrix that the result will be saved into. * @param mappedValue Value that the user wants to get rid of. * @param dimension Index of the dimension of the mappedValue. - * @param transpose State of whether the input matrix is transposed or not. + * @param columnMajor State of whether the input matrix is columnMajor or not. 
*/ void Impute(const arma::Mat& input, arma::Mat& output, const T& mappedValue, const size_t dimension, - const bool transpose = true) + const bool columnMajor = true) { //initiate output output = input; - if (transpose) + if (columnMajor) { arma::Mat medianMat = arma::median(input, 1); for (size_t i = 0; i < input.n_cols; ++i) diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index b719ba29cba..a30508b22f7 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -28,17 +28,17 @@ template class Imputer { public: - Imputer(MapperType mapper, bool transpose = true): + Imputer(MapperType mapper, bool columnMajor = true): mapper(std::move(mapper)), - transpose(transpose) + columnMajor(columnMajor) { // Nothing to initialize here. } - Imputer(MapperType mapper, StrategyType strategy, bool transpose = true): + Imputer(MapperType mapper, StrategyType strategy, bool columnMajor = true): strategy(std::move(strategy)), mapper(std::move(mapper)), - transpose(transpose) + columnMajor(columnMajor) { // Nothing to initialize here. } @@ -58,29 +58,7 @@ class Imputer const size_t dimension) { T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Impute(input, output, mappedValue, dimension, transpose); - } - - /** - * This overload of Impute() lets users to define custom value that can be - * replaced with the target value. - * - * @param input Input dataset to apply imputation. - * @param output Armadillo matrix to save the results - * @oaran missingValue User defined missing value; it can be anything. - * @param customValue The numeric value that a user wants to replace - * missingValue with. - * @param dimension Dimension to apply the imputation. 
- */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const std::string& missingValue, - const T& customValue, - const size_t dimension) - { - T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Impute(input, output, mappedValue, customValue, dimension, - transpose); + strategy.Impute(input, output, mappedValue, dimension, columnMajor); } //! Get the strategy @@ -102,8 +80,8 @@ class Imputer // DatasetMapperType MapperType mapper; - // save transpose as a member variable since it is rarely changed. - bool transpose; + // save columnMajor as a member variable since it is rarely changed. + bool columnMajor; }; // class Imputer diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp index ead543ad6ae..ff60a5a0ce5 100644 --- a/src/mlpack/core/data/map_policies/missing_policy.hpp +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -63,9 +63,11 @@ class MissingPolicy template MappedType MapString(const std::string& string, const size_t dimension, - MapType maps, + MapType& maps, std::vector& types) { + // mute the unused parameter warning (does nothing here.) + (void)types; // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, // we create a mapping. @@ -75,11 +77,10 @@ class MissingPolicy maps[dimension].first.left.count(string) == 0)) { // This string does not exist yet. - size_t& numMappings = maps[dimension].second; - typedef boost::bimap::value_type PairType; maps[dimension].first.insert(PairType(string, NaN)); + size_t& numMappings = maps[dimension].second; ++numMappings; return NaN; } @@ -87,6 +88,9 @@ class MissingPolicy { // This string already exists in the mapping // or not included in missingSet. + // Unlike IncrementPolicy, MissingPolicy counts all mapped values. 
+ size_t& numMappings = maps[dimension].second; + ++numMappings; return NaN; } } @@ -121,17 +125,11 @@ class MissingPolicy std::stringstream token; for (size_t i = 0; i != tokens.size(); ++i) { - // if token is a number, but is included in the missingSet, map it. - if (missingSet.find(tokens[i]) != std::end(missingSet)) - { - const eT val = static_cast(this->MapString(tokens[i], row, maps, - types)); - matrix.at(row, i) = val; - } token.str(tokens[i]); token>>matrix.at(row, i); // if the token is not number, map it. - if (token.fail()) + // or if token is a number, but is included in the missingSet, map it. + if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet)) { const eT val = static_cast(this->MapString(tokens[i], row, maps, types)); diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 603cdcc233a..e367b6a0594 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -72,16 +72,16 @@ int main(int argc, char** argv) // If custom value is specified, and imputation strategy is not, // set imputation strategy to "custom" - if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy")) + if (CLI::HasParam("custom_value") && !CLI::HasParam("strategy")) { strategy = "custom"; - Log::Warn << "--custom_value is specified without --impute_strategy, " - << "--impute_strategy is automatically set to 'custom'." << endl; + Log::Warn << "--custom_value is specified without --strategy, " + << "--strategy is automatically set to 'custom'." << endl; } // Custom value and any other impute strategies cannot be specified at // the same time. 
- if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") && + if (CLI::HasParam("custom_value") && CLI::HasParam("strategy") && strategy != "custom") Log::Fatal << "--custom_value cannot be specified with " << "impute strategies excluding 'custom' strategy" << endl; @@ -109,15 +109,26 @@ int main(int argc, char** argv) << endl; } - // default imputer is mean imputation (to provide scope) - Imputer> impu(info); - if (strategy == "median") + Log::Info << input << endl; + + // Initialize imputer class + Imputer> imputer(info); + if (strategy == "mean") + { + Imputer> imputer(info); + } + else if (strategy == "median") { - Imputer> impu(info); + Imputer> imputer(info); } else if (strategy == "listwise_deletion") { - Imputer> impu(info); + Imputer> imputer(info); + } + else if (strategy == "custom") + { + CustomImputation strat(customValue); + Imputer> imputer(info, strat); } else { @@ -125,24 +136,15 @@ int main(int argc, char** argv) << endl; } - // Initialize imputer class - if (CLI::HasParam("dimension")) { // when --dimension is specified, // the program will apply the changes to only the given dimension. Log::Info << "Performing '" << strategy << "' imputation strategy " - << "to replace '" << missingValue << "' on all dimensions." << endl; + << "to replace '" << missingValue << "' on dimension " << dimension + << "." << endl; - if (strategy == "custom") - { - Imputer> impu(info); - impu.Impute(input, output, missingValue, customValue, dimension); - } - else - { - impu.Impute(input, output, missingValue, dimension); - } + imputer.Impute(input, output, missingValue, dimension); } else { @@ -151,20 +153,9 @@ int main(int argc, char** argv) Log::Info << "Performing '" << strategy << "' imputation strategy " << "to replace '" << missingValue << "' on all dimensions." 
<< endl; - if (strategy == "custom") - { - Imputer> impu(info); - for (size_t i = 0; i < input.n_rows; ++i) - { - impu.Impute(input, output, missingValue, customValue, i); - } - } - else + for (size_t i = 0; i < input.n_rows; ++i) { - for (size_t i = 0; i < input.n_rows; ++i) - { - impu.Impute(input, output, missingValue, i); - } + imputer.Impute(input, output, missingValue, i); } } diff --git a/src/mlpack/tests/data/impute_test.csv b/src/mlpack/tests/data/impute_test.csv new file mode 100644 index 00000000000..06256a4772d --- /dev/null +++ b/src/mlpack/tests/data/impute_test.csv @@ -0,0 +1,3 @@ +a, 2, 3 +5, 6, a +1, 9, 1 diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index e118bfb90d9..08ef4e184d2 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -42,9 +42,8 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) std::set mset; mset.insert("a"); - mset.insert("b"); - MissingPolicy miss(mset); - DatasetMapper info(miss); + MissingPolicy policy(mset); + DatasetMapper info(policy); BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); // row and column test @@ -63,10 +62,11 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) BOOST_REQUIRE(std::isnan(input(2, 1)) == true); BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); + CustomImputation customStrategy(99); // convert missing vals to 99. 
Imputer, - CustomImputation> imputer(info); - imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0 + CustomImputation> imputer(info, customStrategy); + imputer.Impute(input, output, "a", 0); // convert a -> 99 for dimension 0 // Custom imputation result check BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); @@ -96,10 +96,10 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) double customValue = 99; double mappedValue = 0.0; - CustomImputation imputer; + CustomImputation imputer(customValue); // transposed - imputer.Impute(input, outputT, mappedValue, customValue, 0/*dimension*/, true); + imputer.Impute(input, outputT, mappedValue, 0/*dimension*/, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5); @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); // not transposed - imputer.Impute(input, output, mappedValue, customValue, 1, false); + imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5); From 6d43aa3b3dcd93fdc1bd3e9918267b59f762f3a1 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 11 Jul 2016 08:08:24 +0900 Subject: [PATCH 31/40] add input-only overloads to imputation methods --- .../imputation_methods/custom_imputation.hpp | 43 ++++++++++- .../imputation_methods/listwise_deletion.hpp | 62 +++++++++++++--- .../imputation_methods/mean_imputation.hpp | 73 +++++++++++++++++++ .../imputation_methods/median_imputation.hpp | 41 +++++++++++ src/mlpack/core/data/imputer.hpp | 21 +++++- src/mlpack/tests/imputation_test.cpp | 59 ++++++++++++++- 6 files changed, 285 insertions(+), 14 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index a34658b7ee8..35326a77391 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ 
b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -35,7 +35,7 @@ class CustomImputation * @param output Matrix that the result will be saved into. * @param mappedValue Value that the user wants to get rid of. * @param dimension Index of the dimension of the mappedValue. - * @param columnMajor State of whether the input matrix is columnMajord or not. + * @param columnMajor State of whether the input matrix is columnMajor or not. */ void Impute(const arma::Mat& input, arma::Mat& output, @@ -71,6 +71,47 @@ class CustomImputation } } + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the user-defined custom value of the given dimension. + * The result is overwritten to the input, not creating any copy. Custom value + * must be set when initializing the CustomImputation object. + * + * @param input Matrix that contains mappedValue. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param columnMajor State of whether the input matrix is columnMajor or not. 
+ */ + void Impute(arma::Mat& input, + const T& mappedValue, + const size_t dimension, + const bool columnMajor = true) + { + // replace the target value to custom value + if (columnMajor) + { + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) + { + input(dimension, i) = customValue; + } + } + } + else + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) + { + input(i, dimension) = customValue; + } + } + } + } + private: T customValue; }; // class CustomImputation diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index 9a695a69845..0ac84ae110e 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -36,33 +36,73 @@ class ListwiseDeletion const size_t dimension, const bool columnMajor = true) { - // initiate output - output = input; - size_t count = 0; + std::vector colsToKeep; if (columnMajor) { for (size_t i = 0; i < input.n_cols; ++i) { - if (input(dimension, i) == mappedValue || - std::isnan(input(dimension, i))) + if (!(input(dimension, i) == mappedValue || + std::isnan(input(dimension, i)))) { - output.shed_col(i - count); - count++; + colsToKeep.push_back(i); } } + output = input.cols(arma::uvec(colsToKeep)); } else { for (size_t i = 0; i < input.n_rows; ++i) { - if (input(i, dimension) == mappedValue || - std::isnan(input(i, dimension))) + if (!(input(i, dimension) == mappedValue || + std::isnan(input(i, dimension)))) { - output.shed_row(i - count); - count++; + colsToKeep.push_back(i); } } + output = input.rows(arma::uvec(colsToKeep)); + } + } + + /** + * Impute function searches through the input looking for mappedValue and + * remove the whole row or column. The result is overwritten to the input. 
+ * + * @param input Matrix that contains mappedValue. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param columnMajor State of whether the input matrix is columnMajor or not. + */ + void Impute(arma::Mat& input, + const T& mappedValue, + const size_t dimension, + const bool columnMajor = true) + { + std::vector colsToKeep; + + if (columnMajor) + { + for (size_t i = 0; i < input.n_cols; ++i) + { + if (!(input(dimension, i) == mappedValue || + std::isnan(input(dimension, i)))) + { + colsToKeep.push_back(i); + } + } + input = input.cols(arma::uvec(colsToKeep)); + } + else + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (!(input(i, dimension) == mappedValue || + std::isnan(input(i, dimension)))) + { + colsToKeep.push_back(i); + } + } + input = input.rows(arma::uvec(colsToKeep)); } } }; // class ListwiseDeletion diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index c4085c61c89..cfe0de16920 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -96,6 +96,79 @@ class MeanImputation output(target.first, target.second) = mean; } } + + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the mean of the given dimension. The result is overwritten + * to the input matrix. + * + * @param input Matrix that contains mappedValue. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param columnMajor State of whether the input matrix is columnMajor or not. 
+ */ + void Impute(arma::Mat& input, + const T& mappedValue, + const size_t dimension, + const bool columnMajor = true) + { + double sum = 0; + size_t elems = 0; // excluding nan or missing target + + using PairType = std::pair; + // dimensions and indexes are saved as pairs inside this vector. + std::vector targets; + + + // calculate number of elements and sum of them excluding mapped value or + // nan. while doing that, remember where mappedValue or NaN exists. + if (columnMajor) + { + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) + { + targets.emplace_back(dimension, i); + } + else + { + elems++; + sum += input(dimension, i); + } + } + } + else + { + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) + { + targets.emplace_back(i, dimension); + } + else + { + elems++; + sum += input(i, dimension); + } + } + } + + if (elems == 0) + Log::Fatal << "it is impossible to calculate mean; no valid elements in " + << "the dimension" << std::endl; + + // calculate mean; + const double mean = sum / elems; + + // Now replace the calculated mean to the missing variables + // It only needs to loop through targets vector, not the whole matrix. + for (const PairType& target : targets) + { + input(target.first, target.second) = mean; + } + } }; // class MeanImputation } // namespace data diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 00223668649..cf482417f1f 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -65,6 +65,47 @@ class MedianImputation } } } + + /** + * Impute function searches through the input looking for mappedValue and + * replaces it with the median of the given dimension. The result is + * overwritten to the input matrix. 
+ * + * @param input Matrix that contains mappedValue. + * @param mappedValue Value that the user wants to get rid of. + * @param dimension Index of the dimension of the mappedValue. + * @param columnMajor State of whether the input matrix is columnMajor or not. + */ + void Impute(arma::Mat& input, + const T& mappedValue, + const size_t dimension, + const bool columnMajor = true) + { + if (columnMajor) + { + arma::Mat medianMat = arma::median(input, 1); + for (size_t i = 0; i < input.n_cols; ++i) + { + if (input(dimension, i) == mappedValue || + std::isnan(input(dimension, i))) + { + input(dimension, i) = medianMat(dimension, 0); + } + } + } + else + { + arma::Mat medianMat = arma::median(input, 0); + for (size_t i = 0; i < input.n_rows; ++i) + { + if (input(i, dimension) == mappedValue || + std::isnan(input(i, dimension))) + { + input(i, dimension) = medianMat(0, dimension); + } + } + } + } }; // class MedianImputation } // namespace data diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index a30508b22f7..4787343f48d 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -45,7 +45,8 @@ class Imputer /** * Given an input dataset, replace missing values with given imputation - * strategy. + * strategy. This overload saves the result into the output matrix and does not + * change the input matrix. * * @param input Input dataset to apply imputation. * @param output Armadillo matrix to save the results @@ -61,6 +62,24 @@ class Imputer strategy.Impute(input, output, mappedValue, dimension, columnMajor); } + /** + * Given an input dataset, replace missing values with given imputation + * strategy. This overload does not produce output matrix, but overwrites the + * result into the input matrix. + * + * @param input Input dataset to apply imputation. + * @oaran missingValue User defined missing value; it can be anything. + * @param dimension Dimension to apply the imputation. 
+ */ + void Impute(arma::Mat& input, + const std::string& missingValue, + const size_t dimension) + { + T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); + strategy.Impute(input, mappedValue, dimension, columnMajor); + } + + //! Get the strategy const StrategyType& Strategy() const { return strategy; } diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 08ef4e184d2..9b19262bfe8 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -129,6 +129,22 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); + + // overwrite to the input + imputer.Impute(input, mappedValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 3), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); } /** @@ -176,6 +192,22 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); + + // overwrite to the input + imputer.Impute(input, mappedValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 3), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + 
BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); } /** @@ -222,7 +254,22 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); + + // overwrite to the input + imputer.Impute(input, mappedValue, 1/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); } /** @@ -260,6 +307,16 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) BOOST_REQUIRE_CLOSE(output(1, 1), 8.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5); + + // overwrite to the input + imputer.Impute(input, mappedValue, 0, true); // transposed + + BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 1), 4.0, 1e-5); } From fedc5e0ece901746ad15b9c13244713bb36d4f9e Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 11 Jul 2016 12:10:32 +0900 Subject: [PATCH 32/40] update median imputation to exclude missing values --- 
.../imputation_methods/mean_imputation.hpp | 1 - .../imputation_methods/median_imputation.hpp | 56 ++++++++++++++++--- .../preprocess/preprocess_imputer_main.cpp | 2 - src/mlpack/tests/imputation_test.cpp | 6 +- 4 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index cfe0de16920..6c6a7e411df 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -46,7 +46,6 @@ class MeanImputation // dimensions and indexes are saved as pairs inside this vector. std::vector targets; - // calculate number of elements and sum of them excluding mapped value or // nan. while doing that, remember where mappedValue or NaN exists. if (columnMajor) diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index cf482417f1f..5c03bc2901d 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -40,30 +40,50 @@ class MedianImputation //initiate output output = input; + using PairType = std::pair; + // dimensions and indexes are saved as pairs inside this vector. + std::vector targets; + // good elements are kept inside this vector. 
+ std::vector elemsToKeep; + if (columnMajor) { - arma::Mat medianMat = arma::median(input, 1); for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue || std::isnan(input(dimension, i))) { - output(dimension, i) = medianMat(dimension, 0); + targets.emplace_back(dimension, i); + } + else + { + elemsToKeep.push_back(input(dimension, i)); } } } else { - arma::Mat medianMat = arma::median(input, 0); for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue || std::isnan(input(i, dimension))) { - output(i, dimension) = medianMat(0, dimension); + targets.emplace_back(i, dimension); + } + else + { + elemsToKeep.push_back(input(i, dimension)); } } } + + // calculate median + const double median = arma::median(arma::vec(elemsToKeep)); + + for (const PairType& target : targets) + { + output(target.first, target.second) = median; + } } /** @@ -81,30 +101,50 @@ class MedianImputation const size_t dimension, const bool columnMajor = true) { + using PairType = std::pair; + // dimensions and indexes are saved as pairs inside this vector. + std::vector targets; + // good elements are kept inside this vector. 
+ std::vector elemsToKeep; + if (columnMajor) { - arma::Mat medianMat = arma::median(input, 1); for (size_t i = 0; i < input.n_cols; ++i) { if (input(dimension, i) == mappedValue || std::isnan(input(dimension, i))) { - input(dimension, i) = medianMat(dimension, 0); + targets.emplace_back(dimension, i); + } + else + { + elemsToKeep.push_back(input(dimension, i)); } } } else { - arma::Mat medianMat = arma::median(input, 0); for (size_t i = 0; i < input.n_rows; ++i) { if (input(i, dimension) == mappedValue || std::isnan(input(i, dimension))) { - input(i, dimension) = medianMat(0, dimension); + targets.emplace_back(i, dimension); + } + else + { + elemsToKeep.push_back(input(i, dimension)); } } } + + // calculate median + const double median = arma::median(arma::vec(elemsToKeep)); + + for (const PairType& target : targets) + { + input(target.first, target.second) = median; + } } }; // class MedianImputation diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index e367b6a0594..bacc040d570 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -109,8 +109,6 @@ int main(int argc, char** argv) << endl; } - Log::Info << input << endl; - // Initialize imputer class Imputer> imputer(info); if (strategy == "mean") diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 9b19262bfe8..9d79bd9510f 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(outputT(0, 3), 0.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 2), 5.5, 1e-5); + BOOST_REQUIRE_CLOSE(outputT(1, 2), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); 
BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); @@ -244,7 +244,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); @@ -264,7 +264,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5); BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); From 787fd8245b70dca571411f9b2bedbadf218d7dd6 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 18 Jul 2016 11:26:43 +0900 Subject: [PATCH 33/40] optimize imputation methods with output overloads --- .../imputation_methods/custom_imputation.hpp | 48 +++++++++++--- .../imputation_methods/mean_imputation.hpp | 62 +++++++++++++------ .../imputation_methods/median_imputation.hpp | 58 +++++++++++------ src/mlpack/core/data/imputer.hpp | 1 - 4 files changed, 120 insertions(+), 49 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index 35326a77391..f7d8bdee8ea 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -43,29 +43,57 @@ class CustomImputation const size_t dimension, const bool columnMajor = true) { - // initiate output - output = input; + // set size of the output + output.set_size(input.n_rows, input.n_cols); // replace the target value to custom value if (columnMajor) { - for (size_t i = 0; i < input.n_cols; ++i) + 
for (size_t row = 0; row < input.n_rows; ++row) { - if (input(dimension, i) == mappedValue || - std::isnan(input(dimension, i))) + for (size_t col = 0; col < input.n_cols; ++col) { - output(dimension, i) = customValue; + if (row == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + output(row, col) = customValue; + } + else + { + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } else { - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t col = 0; col < input.n_cols; ++ col) { - if (input(i, dimension) == mappedValue || - std::isnan(input(i, dimension))) + for (size_t row = 0; row < input.n_rows; ++row) { - output(i, dimension) = customValue; + if (col == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + output(row, col) = customValue; + } + else + { + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index 6c6a7e411df..b276ca8cbfb 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -36,8 +36,8 @@ class MeanImputation const size_t dimension, const bool columnMajor = true) { - // initiate output - output = input; + // set size of the output + output.set_size(input.n_rows, input.n_cols); double sum = 0; size_t elems = 0; // excluding nan or missing target @@ -50,33 +50,55 @@ class MeanImputation // nan. while doing that, remember where mappedValue or NaN exists. 
if (columnMajor) { - for (size_t i = 0; i < input.n_cols; ++i) + for (size_t row = 0; row < input.n_rows; ++row) { - if (input(dimension, i) == mappedValue || - std::isnan(input(dimension, i))) + for (size_t col = 0; col < input.n_cols; ++col) { - targets.emplace_back(dimension, i); - } - else - { - elems++; - sum += input(dimension, i); + if (row == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + targets.emplace_back(row, col); + } + else + { + elems++; + sum += input(row, col); + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } else { - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t col = 0; col < input.n_cols; ++col) { - if (input(i, dimension) == mappedValue || - std::isnan(input(i, dimension))) + for (size_t row = 0; row < input.n_rows; ++row) { - targets.emplace_back(i, dimension); - } - else - { - elems++; - sum += input(i, dimension); + if (col == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + targets.emplace_back(row, col); + } + else + { + elems++; + sum += input(row, col); + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 5c03bc2901d..658816e65a4 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -37,8 +37,8 @@ class MedianImputation const size_t dimension, const bool columnMajor = true) { - //initiate output - output = input; + // set size of the output + output.set_size(input.n_rows, input.n_cols); using PairType = std::pair; // dimensions and indexes are saved as pairs inside this vector. 
@@ -48,31 +48,53 @@ class MedianImputation if (columnMajor) { - for (size_t i = 0; i < input.n_cols; ++i) + for (size_t row = 0; row < input.n_rows; ++row) { - if (input(dimension, i) == mappedValue || - std::isnan(input(dimension, i))) - { - targets.emplace_back(dimension, i); - } - else + for (size_t col = 0; col < input.n_cols; ++col) { - elemsToKeep.push_back(input(dimension, i)); + if (row == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + targets.emplace_back(row, col); + } + else + { + elemsToKeep.push_back(input(row, col)); + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } else { - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t col = 0; col < input.n_cols; ++col) { - if (input(i, dimension) == mappedValue || - std::isnan(input(i, dimension))) - { - targets.emplace_back(i, dimension); - } - else + for (size_t row = 0; row < input.n_rows; ++row) { - elemsToKeep.push_back(input(i, dimension)); + if (col == dimension) + { + if (input(row, col) == mappedValue || + std::isnan(input(row, col))) + { + targets.emplace_back(row, col); + } + else + { + elemsToKeep.push_back(input(row, col)); + output(row, col) = input(row, col); + } + } + else + { + output(row, col) = input(row, col); + } } } } diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index 4787343f48d..f6134a7bb25 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -79,7 +79,6 @@ class Imputer strategy.Impute(input, mappedValue, dimension, columnMajor); } - //! 
Get the strategy const StrategyType& Strategy() const { return strategy; } From a0b7d590025de5cb92c67643fb44801e038eede6 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 18 Jul 2016 11:38:21 +0900 Subject: [PATCH 34/40] expressive comments in imputation_test --- src/mlpack/tests/imputation_test.cpp | 62 +++++++++++++++------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 9d79bd9510f..13b4614b563 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -24,9 +24,11 @@ using namespace mlpack::data; using namespace std; BOOST_AUTO_TEST_SUITE(ImputationTest); - /** - * Make sure a CSV is loaded correctly. + * 1. Make sure a CSV is loaded correctly with mappings using MissingPolicy. + * 2. Try Imputer object with CustomImputation method to impute data "a". + * (It is ok to test on one method since the other ones will be covered in the + * next cases). */ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) { @@ -62,11 +64,13 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) BOOST_REQUIRE(std::isnan(input(2, 1)) == true); BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); - CustomImputation customStrategy(99); // convert missing vals to 99. + // convert missing vals to 99. + CustomImputation customStrategy(99); Imputer, CustomImputation> imputer(info, customStrategy); - imputer.Impute(input, output, "a", 0); // convert a -> 99 for dimension 0 + // convert a or nan to 99 for dimension 0 + imputer.Impute(input, output, "a", 0); // Custom imputation result check BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); @@ -84,21 +88,21 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) } /** - * Make sure a CSV is loaded correctly. + * Make sure CustomImputation method replaces data 0 to 99. 
*/ BOOST_AUTO_TEST_CASE(CustomImputationTest) { arma::mat input("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is transposed - arma::mat output; // assume input is not transposed + arma::mat outputT; // assume input is column wise + arma::mat output; // assume input is row wise double customValue = 99; double mappedValue = 0.0; CustomImputation imputer(customValue); - // transposed + // column wise imputer.Impute(input, outputT, mappedValue, 0/*dimension*/, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); @@ -114,7 +118,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); - // not transposed + // row wise imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); @@ -148,20 +152,21 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) } /** - * Make sure a CSV is loaded correctly. + * Make sure MeanImputation method replaces data 0 to mean value of each + * dimensions. */ BOOST_AUTO_TEST_CASE(MeanImputationTest) { arma::mat input("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is transposed - arma::mat output; // assume input is not transposed + arma::mat outputT; // assume input is column wise + arma::mat output; // assume input is row wise double mappedValue = 0.0; MeanImputation imputer; - // transposed + // column wise imputer.Impute(input, outputT, mappedValue, 0, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); @@ -177,7 +182,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); - // not transposed + // row wise imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); @@ -211,20 +216,21 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) } /** - * Make sure a CSV is loaded correctly. 
+ * Make sure MeanImputation method replaces data 0 to median value of each + * dimensions. */ BOOST_AUTO_TEST_CASE(MedianImputationTest) { arma::mat input("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is transposed - arma::mat output; // assume input is not transposed + arma::mat outputT; // assume input is column wise + arma::mat output; // assume input is row wise double mappedValue = 0.0; MedianImputation imputer; - // transposed + // column wise imputer.Impute(input, outputT, mappedValue, 1, true); BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); @@ -240,7 +246,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); - // not transposed + // row wise imputer.Impute(input, output, mappedValue, 1, false); BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); @@ -273,21 +279,22 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) } /** - * Make sure a CSV is loaded correctly. + * Make sure ListwiseDeletion method deletes the whole column (if column wise) + * or the row (if row wise) containing value of 0. 
*/ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) { arma::mat input("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is transposed - arma::mat output; // assume input is not transposed + arma::mat outputT; // assume input is column wise + arma::mat output; // assume input is row wise double mappedValue = 0.0; ListwiseDeletion imputer; - // transposed - imputer.Impute(input, outputT, mappedValue, 0, true); // transposed + // column wise + imputer.Impute(input, outputT, mappedValue, 0, true); // column wise BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.0, 1e-5); @@ -296,8 +303,8 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); BOOST_REQUIRE_CLOSE(outputT(2, 1), 4.0, 1e-5); - // not transposed - imputer.Impute(input, output, mappedValue, 1, false); // not transposed + // row wise + imputer.Impute(input, output, mappedValue, 1, false); // row wise BOOST_REQUIRE_CLOSE(output(0, 0), 5.0, 1e-5); BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); @@ -309,7 +316,7 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5); // overwrite to the input - imputer.Impute(input, mappedValue, 0, true); // transposed + imputer.Impute(input, mappedValue, 0, true); // column wise BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); BOOST_REQUIRE_CLOSE(input(0, 1), 2.0, 1e-5); @@ -319,5 +326,4 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) BOOST_REQUIRE_CLOSE(input(2, 1), 4.0, 1e-5); } - BOOST_AUTO_TEST_SUITE_END(); From 9a6dce711728f0beb521ea8bcb6339d15aa5914c Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 18 Jul 2016 13:06:36 +0900 Subject: [PATCH 35/40] shorten imputation tests --- src/mlpack/tests/imputation_test.cpp | 63 ++++++++-------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 13b4614b563..50bada0611e 100644 --- 
a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -24,6 +24,19 @@ using namespace mlpack::data; using namespace std; BOOST_AUTO_TEST_SUITE(ImputationTest); +/** + * Check if two matrixes are equal. + */ +void CheckEqual(const arma::mat& lhs, const arma::mat& rhs) +{ + BOOST_REQUIRE(lhs.n_rows == rhs.n_rows); + BOOST_REQUIRE(lhs.n_cols == rhs.n_cols); + for(size_t i = 0; i != lhs.n_elem; ++i) + { + BOOST_REQUIRE_CLOSE(lhs[i], rhs[i], 1e-5); + } +} + /** * 1. Make sure a CSV is loaded correctly with mappings using MissingPolicy. * 2. Try Imputer object with CustomImputation method to impute data "a". @@ -136,19 +149,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 0/*dimension*/, true); - - BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 1), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 3), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); + CheckEqual(input, output); } /** @@ -200,19 +201,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 0/*dimension*/, true); - - BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 1), 2.5, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 3), 2.5, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 1), 
8.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); + CheckEqual(input, output); } /** @@ -263,19 +252,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 1/*dimension*/, true); - - BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 1), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 2), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5); + CheckEqual(input, output); } /** @@ -317,13 +294,7 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) // overwrite to the input imputer.Impute(input, mappedValue, 0, true); // column wise - - BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(0, 1), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(1, 1), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(input(2, 1), 4.0, 1e-5); + CheckEqual(input, output); } BOOST_AUTO_TEST_SUITE_END(); From c3aeba1fc8481ff08e5c689907e421f747b913ad Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 18 Jul 2016 13:40:41 +0900 Subject: [PATCH 36/40] optimize preprocess imputer executable --- .../preprocess/preprocess_imputer_main.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index bacc040d570..c25f3a9e22a 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -99,14 
+99,20 @@ int main(int argc, char** argv) MissingPolicy policy(missingSet); using MapperType = DatasetMapper; DatasetMapper info(policy); + std::vector dirtyDimensions; Load(inputFile, input, info, true, true); // print how many mapping exist in each dimensions for (size_t i = 0; i < input.n_rows; ++i) { - Log::Info << info.NumMappings(i) << " mappings in dimension " << i << "." + size_t numMappings = info.NumMappings(i); + Log::Info << numMappings << " mappings in dimension " << i << "." << endl; + if (numMappings > 0) + { + dirtyDimensions.push_back(i); + } } // Initialize imputer class @@ -134,6 +140,7 @@ int main(int argc, char** argv) << endl; } + Timer::Start("imputation"); if (CLI::HasParam("dimension")) { // when --dimension is specified, @@ -142,7 +149,7 @@ int main(int argc, char** argv) << "to replace '" << missingValue << "' on dimension " << dimension << "." << endl; - imputer.Impute(input, output, missingValue, dimension); + imputer.Impute(input, missingValue, dimension); } else { @@ -151,16 +158,17 @@ int main(int argc, char** argv) Log::Info << "Performing '" << strategy << "' imputation strategy " << "to replace '" << missingValue << "' on all dimensions." << endl; - for (size_t i = 0; i < input.n_rows; ++i) + for (size_t i : dirtyDimensions) { - imputer.Impute(input, output, missingValue, i); + imputer.Impute(input, missingValue, i); } } + Timer::Stop("imputation"); if (!outputFile.empty()) { Log::Info << "Saving results to '" << outputFile << "'." 
<< endl; - Save(outputFile, output, false); + Save(outputFile, input, false); } } From 028c217057410e2e75691c32bf062202ce5dca3f Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Mon, 18 Jul 2016 15:16:02 +0900 Subject: [PATCH 37/40] fix bugs in imputation test --- src/mlpack/tests/imputation_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 50bada0611e..2e815efed75 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -149,7 +149,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 0/*dimension*/, true); - CheckEqual(input, output); + CheckEqual(input, outputT); } /** @@ -201,7 +201,7 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 0/*dimension*/, true); - CheckEqual(input, output); + CheckEqual(input, outputT); } /** @@ -252,7 +252,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest) // overwrite to the input imputer.Impute(input, mappedValue, 1/*dimension*/, true); - CheckEqual(input, output); + CheckEqual(input, outputT); } /** @@ -294,7 +294,7 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) // overwrite to the input imputer.Impute(input, mappedValue, 0, true); // column wise - CheckEqual(input, output); + CheckEqual(input, outputT); } BOOST_AUTO_TEST_SUITE_END(); From 03e19a43f28e743485e128ff19056ac5e4b71017 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Fri, 22 Jul 2016 21:33:50 +0900 Subject: [PATCH 38/40] add more comments and delete impute_test.csv --- src/mlpack/core/data/dataset_mapper.hpp | 18 ++++++++++++++++-- .../imputation_methods/custom_imputation.hpp | 1 + src/mlpack/tests/data/impute_test.csv | 3 --- 3 files changed, 17 insertions(+), 5 deletions(-) delete mode 100644 src/mlpack/tests/data/impute_test.csv diff --git a/src/mlpack/core/data/dataset_mapper.hpp 
b/src/mlpack/core/data/dataset_mapper.hpp index 0001438eb5d..f7f12d2a689 100644 --- a/src/mlpack/core/data/dataset_mapper.hpp +++ b/src/mlpack/core/data/dataset_mapper.hpp @@ -79,9 +79,20 @@ class DatasetMapper typename PolicyType::MappedType UnmapValue(const std::string& string, const size_t dimension); + /** + * MapTokens turns vector of strings into numeric variables and puts them + * into a given matrix. It is uses mapping policy to store categorical values + * to maps. How it determines whether a value is categorical and how it + * stores the categorical value into map and replaces with the numerical value + * all depends on the mapping policy object's MapTokens() funciton. + * + * @tparam eT Type of armadillo matrix. + * @param tokens Vector of variables inside a dimension. + * @param row Position of the given tokens. + * @param matrix Matrix to save the data into. + */ template - void MapTokens(const std::vector& tokens, - size_t& row, + void MapTokens(const std::vector& tokens, size_t& row, arma::Mat& matrix); //! Return the type of a given dimension (numeric or categorical). @@ -134,8 +145,11 @@ class DatasetMapper // MapType = map, numMappings>> using MapType = std::unordered_map>; + //! maps object stores string and numerical pairs. MapType maps; + //! policy object tells dataset mapper how the categorical values should be + // mapped to the maps object. It is used in MapString() and MapTokens(). PolicyType policy; }; diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index f7d8bdee8ea..03b9d7771ea 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -141,6 +141,7 @@ class CustomImputation } private: + //! A user-defined value that the user wants to replace missing values with. 
T customValue; }; // class CustomImputation diff --git a/src/mlpack/tests/data/impute_test.csv b/src/mlpack/tests/data/impute_test.csv deleted file mode 100644 index 06256a4772d..00000000000 --- a/src/mlpack/tests/data/impute_test.csv +++ /dev/null @@ -1,3 +0,0 @@ -a, 2, 3 -5, 6, a -1, 9, 1 From 5eb9abdca9f82a6cb6e3c54c5aa7647c53f464a4 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Fri, 22 Jul 2016 23:59:48 +0900 Subject: [PATCH 39/40] fix PARAM statements in imputer --- .../preprocess/preprocess_imputer_main.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index c25f3a9e22a..2863b3e65a3 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -27,14 +27,14 @@ PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user " "column-wise dataset, and save the result to result.csv, we could run" "\n\n" "$ mlpack_preprocess_imputer -i dataset.csv -o result.csv -m NULL -d 0 \n" - "> -s listwise_deletion") - -PARAM_STRING_REQ("input_file", "File containing data,", "i"); -PARAM_STRING("output_file", "File to save output", "o", ""); -PARAM_STRING("missing_value", "User defined missing value", "m", "") -PARAM_STRING("strategy", "imputation strategy to be applied", "s", "") -PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0) -PARAM_INT("dimension", "the dimension to apply imputation", "d", 0); + "> -s listwise_deletion"); + +PARAM_STRING_IN_REQ("input_file", "File containing data,", "i"); +PARAM_STRING_OUT("output_file", "File to save output", "o"); +PARAM_STRING_IN("missing_value", "User defined missing value", "m", ""); +PARAM_STRING_IN("strategy", "imputation strategy to be applied", "s", ""); +PARAM_DOUBLE_IN("custom_value", "user_defined custom value", "c", 0.0); +PARAM_INT_IN("dimension", "the dimension to 
apply imputation", "d", 0); using namespace mlpack; using namespace arma; From d04323513302b5e039001e45852e79a74aba3740 Mon Sep 17 00:00:00 2001 From: Keon Kim Date: Sat, 23 Jul 2016 12:54:08 +0900 Subject: [PATCH 40/40] delete Impute() overloads that produce output matrix --- .../imputation_methods/custom_imputation.hpp | 74 ----- .../imputation_methods/listwise_deletion.hpp | 44 --- .../imputation_methods/mean_imputation.hpp | 99 ------- .../imputation_methods/median_imputation.hpp | 88 ------ src/mlpack/core/data/imputer.hpp | 19 -- src/mlpack/tests/imputation_test.cpp | 270 ++++++++---------- 6 files changed, 118 insertions(+), 476 deletions(-) diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp index 03b9d7771ea..c8674a3a1fe 100644 --- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp @@ -25,80 +25,6 @@ class CustomImputation // nothing to initialize here } - /** - * Impute function searches through the input looking for mappedValue and - * replaces it with the user-defined custom value of the given dimension. - * The result is saved to the output. Custom value must be set when - * initializing the CustomImputation object. - * - * @param input Matrix that contains mappedValue. - * @param output Matrix that the result will be saved into. - * @param mappedValue Value that the user wants to get rid of. - * @param dimension Index of the dimension of the mappedValue. - * @param columnMajor State of whether the input matrix is columnMajor or not. 
- */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const size_t dimension, - const bool columnMajor = true) - { - // set size of the output - output.set_size(input.n_rows, input.n_cols); - - // replace the target value to custom value - if (columnMajor) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - for (size_t col = 0; col < input.n_cols; ++col) - { - if (row == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - output(row, col) = customValue; - } - else - { - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - else - { - for (size_t col = 0; col < input.n_cols; ++ col) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - if (col == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - output(row, col) = customValue; - } - else - { - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - } - /** * Impute function searches through the input looking for mappedValue and * replaces it with the user-defined custom value of the given dimension. diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp index 0ac84ae110e..36eeeeb58a0 100644 --- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp +++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp @@ -20,50 +20,6 @@ template class ListwiseDeletion { public: - /** - * Impute function searches through the input looking for mappedValue and - * remove the whole row or column. The result is saved to the output. - * - * @param input Matrix that contains mappedValue. - * @param output Matrix that the result will be saved into. - * @param mappedValue Value that the user wants to get rid of. - * @param dimension Index of the dimension of the mappedValue. 
- * @param columnMajor State of whether the input matrix is columnMajor or not. - */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const size_t dimension, - const bool columnMajor = true) - { - std::vector colsToKeep; - - if (columnMajor) - { - for (size_t i = 0; i < input.n_cols; ++i) - { - if (!(input(dimension, i) == mappedValue || - std::isnan(input(dimension, i)))) - { - colsToKeep.push_back(i); - } - } - output = input.cols(arma::uvec(colsToKeep)); - } - else - { - for (size_t i = 0; i < input.n_rows; ++i) - { - if (!(input(i, dimension) == mappedValue || - std::isnan(input(i, dimension)))) - { - colsToKeep.push_back(i); - } - } - output = input.rows(arma::uvec(colsToKeep)); - } - } - /** * Impute function searches through the input looking for mappedValue and * remove the whole row or column. The result is overwritten to the input. diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp index b276ca8cbfb..e7a955ea563 100644 --- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp @@ -19,105 +19,6 @@ template class MeanImputation { public: - /** - * Impute function searches through the input looking for mappedValue and - * replaces it with the mean of the given dimension. The result is saved - * to the output. - * - * @param input Matrix that contains mappedValue. - * @param output Matrix that the result will be saved into. - * @param mappedValue Value that the user wants to get rid of. - * @param dimension Index of the dimension of the mappedValue. - * @param columnMajor State of whether the input matrix is columnMajor or not. 
- */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const size_t dimension, - const bool columnMajor = true) - { - // set size of the output - output.set_size(input.n_rows, input.n_cols); - - double sum = 0; - size_t elems = 0; // excluding nan or missing target - - using PairType = std::pair; - // dimensions and indexes are saved as pairs inside this vector. - std::vector targets; - - // calculate number of elements and sum of them excluding mapped value or - // nan. while doing that, remember where mappedValue or NaN exists. - if (columnMajor) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - for (size_t col = 0; col < input.n_cols; ++col) - { - if (row == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - targets.emplace_back(row, col); - } - else - { - elems++; - sum += input(row, col); - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - else - { - for (size_t col = 0; col < input.n_cols; ++col) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - if (col == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - targets.emplace_back(row, col); - } - else - { - elems++; - sum += input(row, col); - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - - if (elems == 0) - Log::Fatal << "it is impossible to calculate mean; no valid elements in " - << "the dimension" << std::endl; - - // calculate mean; - const double mean = sum / elems; - - // Now replace the calculated mean to the missing variables - // It only needs to loop through targets vector, not the whole matrix. - for (const PairType& target : targets) - { - output(target.first, target.second) = mean; - } - } - /** * Impute function searches through the input looking for mappedValue and * replaces it with the mean of the given dimension. 
The result is overwritten diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp index 658816e65a4..828d22ae6db 100644 --- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp +++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp @@ -20,94 +20,6 @@ template class MedianImputation { public: - /** - * Impute function searches through the input looking for mappedValue and - * replaces it with the median of the given dimension. The result is saved - * to the output. - * - * @param input Matrix that contains mappedValue. - * @param output Matrix that the result will be saved into. - * @param mappedValue Value that the user wants to get rid of. - * @param dimension Index of the dimension of the mappedValue. - * @param columnMajor State of whether the input matrix is columnMajor or not. - */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const T& mappedValue, - const size_t dimension, - const bool columnMajor = true) - { - // set size of the output - output.set_size(input.n_rows, input.n_cols); - - using PairType = std::pair; - // dimensions and indexes are saved as pairs inside this vector. - std::vector targets; - // good elements are kept inside this vector. 
- std::vector elemsToKeep; - - if (columnMajor) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - for (size_t col = 0; col < input.n_cols; ++col) - { - if (row == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - targets.emplace_back(row, col); - } - else - { - elemsToKeep.push_back(input(row, col)); - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - else - { - for (size_t col = 0; col < input.n_cols; ++col) - { - for (size_t row = 0; row < input.n_rows; ++row) - { - if (col == dimension) - { - if (input(row, col) == mappedValue || - std::isnan(input(row, col))) - { - targets.emplace_back(row, col); - } - else - { - elemsToKeep.push_back(input(row, col)); - output(row, col) = input(row, col); - } - } - else - { - output(row, col) = input(row, col); - } - } - } - } - - // calculate median - const double median = arma::median(arma::vec(elemsToKeep)); - - for (const PairType& target : targets) - { - output(target.first, target.second) = median; - } - } - /** * Impute function searches through the input looking for mappedValue and * replaces it with the median of the given dimension. The result is diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp index f6134a7bb25..ea1ac68fbf5 100644 --- a/src/mlpack/core/data/imputer.hpp +++ b/src/mlpack/core/data/imputer.hpp @@ -43,25 +43,6 @@ class Imputer // Nothing to initialize here. } - /** - * Given an input dataset, replace missing values with given imputation - * strategy. This overload saves the result into the output matrix and does not - * change the input matrix. - * - * @param input Input dataset to apply imputation. - * @param output Armadillo matrix to save the results - * @oaran missingValue User defined missing value; it can be anything. - * @param dimension Dimension to apply the imputation. 
- */ - void Impute(const arma::Mat& input, - arma::Mat& output, - const std::string& missingValue, - const size_t dimension) - { - T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension)); - strategy.Impute(input, output, mappedValue, dimension, columnMajor); - } - /** * Given an input dataset, replace missing values with given imputation * strategy. This overload does not produce output matrix, but overwrites the diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 2e815efed75..ce48ad0bddb 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -24,19 +24,6 @@ using namespace mlpack::data; using namespace std; BOOST_AUTO_TEST_SUITE(ImputationTest); -/** - * Check if two matrixes are equal. - */ -void CheckEqual(const arma::mat& lhs, const arma::mat& rhs) -{ - BOOST_REQUIRE(lhs.n_rows == rhs.n_rows); - BOOST_REQUIRE(lhs.n_cols == rhs.n_cols); - for(size_t i = 0; i != lhs.n_elem; ++i) - { - BOOST_REQUIRE_CLOSE(lhs[i], rhs[i], 1e-5); - } -} - /** * 1. Make sure a CSV is loaded correctly with mappings using MissingPolicy. * 2. Try Imputer object with CustomImputation method to impute data "a". 
@@ -53,7 +40,6 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) f.close(); arma::mat input; - arma::mat output; std::set mset; mset.insert("a"); @@ -83,18 +69,18 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) DatasetMapper, CustomImputation> imputer(info, customStrategy); // convert a or nan to 99 for dimension 0 - imputer.Impute(input, output, "a", 0); + imputer.Impute(input, "a", 0); // Custom imputation result check - BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5); - BOOST_REQUIRE(std::isnan(output(2, 1)) == true); // remains as NaN - BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 0), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 3.0, 1e-5); + BOOST_REQUIRE(std::isnan(input(2, 1)) == true); // remains as NaN + BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); // Remove the file. 
remove("test_file.csv"); @@ -105,51 +91,46 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) */ BOOST_AUTO_TEST_CASE(CustomImputationTest) { - arma::mat input("3.0 0.0 2.0 0.0;" + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is column wise - arma::mat output; // assume input is row wise + arma::mat rowWiseInput(columnWiseInput); double customValue = 99; double mappedValue = 0.0; CustomImputation imputer(customValue); // column wise - imputer.Impute(input, outputT, mappedValue, 0/*dimension*/, true); - - BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 3), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + imputer.Impute(columnWiseInput, mappedValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); // row wise - imputer.Impute(input, output, mappedValue, 1, false); 
- - BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); - - // overwrite to the input - imputer.Impute(input, mappedValue, 0/*dimension*/, true); - CheckEqual(input, outputT); + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 3), 8.0, 1e-5); } /** @@ -158,50 +139,45 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest) */ BOOST_AUTO_TEST_CASE(MeanImputationTest) { - arma::mat input("3.0 0.0 2.0 0.0;" + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is column wise - arma::mat output; // assume input is row wise + arma::mat rowWiseInput(columnWiseInput); double mappedValue = 0.0; MeanImputation imputer; // column wise - imputer.Impute(input, outputT, mappedValue, 0, true); - - BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 1), 
2.5, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 3), 2.5, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + imputer.Impute(columnWiseInput, mappedValue, 0, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); // row wise - imputer.Impute(input, output, mappedValue, 1, false); - - BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5); - - // overwrite to the input - imputer.Impute(input, mappedValue, 0/*dimension*/, true); - 
CheckEqual(input, outputT); + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 3), 8.0, 1e-5); } /** @@ -210,49 +186,44 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest) */ BOOST_AUTO_TEST_CASE(MedianImputationTest) { - arma::mat input("3.0 0.0 2.0 0.0;" + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is column wise - arma::mat output; // assume input is row wise + arma::mat rowWiseInput(columnWiseInput); double mappedValue = 0.0; MedianImputation imputer; // column wise - imputer.Impute(input, outputT, mappedValue, 1, true); - - BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 1), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 3), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 2), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5); + imputer.Impute(columnWiseInput, mappedValue, 1, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 0.0, 
1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); // row wise - imputer.Impute(input, output, mappedValue, 1, false); - - BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5); - - // overwrite to the input - imputer.Impute(input, mappedValue, 1/*dimension*/, true); - CheckEqual(input, outputT); + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); } /** @@ -261,40 +232,35 @@ 
BOOST_AUTO_TEST_CASE(MedianImputationTest) */ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) { - arma::mat input("3.0 0.0 2.0 0.0;" + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" "5.0 6.0 0.0 6.0;" "9.0 8.0 4.0 8.0;"); - arma::mat outputT; // assume input is column wise - arma::mat output; // assume input is row wise + arma::mat rowWiseInput(columnWiseInput); double mappedValue = 0.0; ListwiseDeletion imputer; // column wise - imputer.Impute(input, outputT, mappedValue, 0, true); // column wise + imputer.Impute(columnWiseInput, mappedValue, 0, true); // column wise - BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(0, 1), 2.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(1, 1), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(outputT(2, 1), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 4.0, 1e-5); // row wise - imputer.Impute(input, output, mappedValue, 1, false); // row wise - - BOOST_REQUIRE_CLOSE(output(0, 0), 5.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 2), 0.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(0, 3), 6.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 0), 9.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 1), 8.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 2), 4.0, 1e-5); - BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5); - - // overwrite to the input - imputer.Impute(input, mappedValue, 0, true); // column wise - CheckEqual(input, outputT); + imputer.Impute(rowWiseInput, mappedValue, 1, false); // row wise + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 6.0, 1e-5); + 
BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 8.0, 1e-5); } BOOST_AUTO_TEST_SUITE_END();