diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index f11f19cc8c6..2fbd5d32e62 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -1,8 +1,8 @@ # Define the files that we need to compile. # Anything not in this list will not be compiled into mlpack. set(SOURCES - dataset_info.hpp - dataset_info_impl.hpp + dataset_mapper.hpp + dataset_mapper_impl.hpp extension.hpp format.hpp load.hpp @@ -15,6 +15,7 @@ set(SOURCES save_impl.hpp serialization_shim.hpp split_data.hpp + imputer.hpp binarize.hpp ) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp deleted file mode 100644 index 29c7cee8be4..00000000000 --- a/src/mlpack/core/data/dataset_info.hpp +++ /dev/null @@ -1,114 +0,0 @@ -/** - * @file dataset_info.hpp - * @author Ryan Curtin - * - * Defines the DatasetInfo class, which holds information about a dataset. This - * is useful when the dataset contains categorical non-numeric features that - * needs to be mapped to categorical numeric features. - */ -#ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP -#define MLPACK_CORE_DATA_DATASET_INFO_HPP - -#include -#include -#include - -namespace mlpack { -namespace data { - -/** - * The Datatype enum specifies the types of data mlpack algorithms can use. The - * vast majority of mlpack algorithms can only use numeric data (i.e. - * float/double/etc.), but some algorithms can use categorical data, specified - * via this Datatype enum and the DatasetInfo class. - */ -enum Datatype : bool /* bool is all the precision we need for two types */ -{ - numeric = 0, - categorical = 1 -}; - -/** - * Auxiliary information for a dataset, including mappings to/from strings and - * the datatype of each dimension. 
DatasetInfo objects are optionally produced - * by data::Load(), and store the type of each dimension (Datatype::numeric or - * Datatype::categorical) as well as mappings from strings to unsigned integers - * and vice versa. - */ -class DatasetInfo -{ - public: - /** - * Create the DatasetInfo object with the given dimensionality. Note that the - * dimensionality cannot be changed later; you will have to create a new - * DatasetInfo object. - */ - DatasetInfo(const size_t dimensionality = 0); - - /** - * Given the string and the dimension to which it belongs, return its numeric - * mapping. If no mapping yet exists, the string is added to the list of - * mappings for the given dimension. The dimension parameter refers to the - * index of the dimension of the string (i.e. the row in the dataset). - * - * @param string String to find/create mapping for. - * @param dimension Index of the dimension of the string. - */ - size_t MapString(const std::string& string, const size_t dimension); - - /** - * Return the string that corresponds to a given value in a given dimension. - * If the string is not a valid mapping in the given dimension, a - * std::invalid_argument is thrown. - * - * @param value Mapped value for string. - * @param dimension Dimension to unmap string from. - */ - const std::string& UnmapString(const size_t value, const size_t dimension); - - //! Return the type of a given dimension (numeric or categorical). - Datatype Type(const size_t dimension) const; - //! Modify the type of a given dimension (be careful!). - Datatype& Type(const size_t dimension); - - /** - * Get the number of mappings for a particular dimension. If the dimension - * is numeric, then this will return 0. - */ - size_t NumMappings(const size_t dimension) const; - - /** - * Get the dimensionality of the DatasetInfo object (that is, how many - * dimensions it has information for). 
If this object was created by a call - * to mlpack::data::Load(), then the dimensionality will be the same as the - * number of rows (dimensions) in the dataset. - */ - size_t Dimensionality() const; - - /** - * Serialize the dataset information. - */ - template - void Serialize(Archive& ar, const unsigned int /* version */) - { - ar & data::CreateNVP(types, "types"); - ar & data::CreateNVP(maps, "maps"); - } - - private: - //! Types of each dimension. - std::vector types; - - //! Mappings from strings to integers. Map entries will only exist for - //! dimensions that are categorical. - std::unordered_map, - size_t>> maps; - -}; - -} // namespace data -} // namespace mlpack - -#include "dataset_info_impl.hpp" - -#endif diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp deleted file mode 100644 index a3ee24dc576..00000000000 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ /dev/null @@ -1,100 +0,0 @@ -/** - * @file dataset_info_impl.hpp - * @author Ryan Curtin - * - * An implementation of the DatasetInfo class. - */ -#ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP -#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP - -// In case it hasn't already been included. -#include "dataset_info.hpp" - -namespace mlpack { -namespace data { - -// Default constructor. -inline DatasetInfo::DatasetInfo(const size_t dimensionality) : - types(dimensionality, Datatype::numeric) -{ - // Nothing to initialize. -} - -// Map the string to a numeric id. -inline size_t DatasetInfo::MapString(const std::string& string, - const size_t dimension) -{ - // If this condition is true, either we have no mapping for the given string - // or we have no mappings for the given dimension at all. In either case, - // we create a mapping. - if (maps.count(dimension) == 0 || - maps[dimension].first.left.count(string) == 0) - { - // This string does not exist yet. 
- size_t& numMappings = maps[dimension].second; - if (numMappings == 0) - types[dimension] = Datatype::categorical; - typedef boost::bimap::value_type PairType; - maps[dimension].first.insert(PairType(string, numMappings)); - return numMappings++; - } - else - { - // This string already exists in the mapping. - return maps[dimension].first.left.at(string); - } -} - -// Return the string corresponding to a value in a given dimension. -inline const std::string& DatasetInfo::UnmapString( - const size_t value, - const size_t dimension) -{ - // Throw an exception if the value doesn't exist. - if (maps[dimension].first.right.count(value) == 0) - { - std::ostringstream oss; - oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for " - << "dimension " << dimension; - throw std::invalid_argument(oss.str()); - } - - return maps[dimension].first.right.at(value); -} - -// Get the type of a particular dimension. -inline Datatype DatasetInfo::Type(const size_t dimension) const -{ - if (dimension >= types.size()) - { - std::ostringstream oss; - oss << "requested type of dimension " << dimension << ", but dataset only " - << "has " << types.size() << " dimensions"; - throw std::invalid_argument(oss.str()); - } - - return types[dimension]; -} - -inline Datatype& DatasetInfo::Type(const size_t dimension) -{ - if (dimension >= types.size()) - types.resize(dimension + 1, Datatype::numeric); - - return types[dimension]; -} - -inline size_t DatasetInfo::NumMappings(const size_t dimension) const -{ - return (maps.count(dimension) == 0) ? 
0 : maps.at(dimension).second;
-}
-
-inline size_t DatasetInfo::Dimensionality() const
-{
-  return types.size();
-}
-
-} // namespace data
-} // namespace mlpack
-
-#endif
diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp
new file mode 100644
index 00000000000..f7f12d2a689
--- /dev/null
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -0,0 +1,164 @@
+/**
+ * @file dataset_mapper.hpp
+ * @author Ryan Curtin
+ * @author Keon Kim
+ *
+ * Defines the DatasetMapper class, which holds information about a dataset.
+ * This is useful when the dataset contains categorical non-numeric features
+ * that need to be mapped to categorical numeric features.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_HPP
+
+#include
+#include
+#include
+
+#include "map_policies/increment_policy.hpp"
+
+namespace mlpack {
+namespace data {
+/**
+ * Auxiliary information for a dataset, including mappings to/from strings and
+ * the datatype of each dimension. DatasetMapper objects are optionally
+ * produced by data::Load(), and store the type of each dimension
+ * (Datatype::numeric or Datatype::categorical) as well as mappings from strings
+ * to mapped values (PolicyType::MappedType) and vice versa.
+ *
+ * @tparam PolicyType Mapping policy implementing MapString() and MapTokens().
+ */
+template
+class DatasetMapper
+{
+ public:
+  /**
+   * Create the DatasetMapper object with the given dimensionality. Every
+   * dimension starts out as Datatype::numeric; the non-const Type() overload
+   * grows the object when it is asked about a larger dimension.
+   */
+  explicit DatasetMapper(const size_t dimensionality = 0);
+
+  /**
+   * Create the DatasetMapper object with the given policy and dimensionality.
+   * The policy argument is moved from, so the caller's object is left in a
+   * moved-from state. The stored policy can be changed later via Policy().
+   */
+  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
+
+  /**
+   * Given the string and the dimension to which it belongs, return its numeric
+   * mapping. If no mapping yet exists, the string is added to the list of
+   * mappings for the given dimension. The dimension parameter refers to the
+   * index of the dimension of the string (i.e. the row in the dataset).
+   *
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   */
+  typename PolicyType::MappedType MapString(const std::string& string,
+                                            const size_t dimension);
+
+  /**
+   * Return the string that corresponds to a given value in a given dimension.
+   * If the value is not a valid mapping in the given dimension, a
+   * std::invalid_argument is thrown.
+   *
+   * @param value Mapped value for string.
+   * @param dimension Dimension to unmap string from.
+   */
+  const std::string& UnmapString(const size_t value, const size_t dimension);
+
+
+  /**
+   * Return the value that corresponds to a given string in a given dimension.
+   * If the string is not a valid mapping in the given dimension, a
+   * std::invalid_argument is thrown.
+   *
+   * @param string String whose mapped value is requested.
+   * @param dimension Dimension in which to look the string up.
+   */
+  typename PolicyType::MappedType UnmapValue(const std::string& string,
+                                             const size_t dimension);
+
+  /**
+   * MapTokens turns a vector of strings into numeric values and puts them
+   * into a given matrix. It uses the mapping policy to store categorical
+   * values in maps. How it determines whether a value is categorical, how it
+   * stores the categorical value in maps, and what numeric value replaces it
+   * all depend on the mapping policy object's MapTokens() function.
+   *
+   * @tparam eT Type of armadillo matrix.
+   * @param tokens Vector of variables inside a dimension.
+   * @param row Position of the given tokens.
+   * @param matrix Matrix to save the data into.
+   */
+  template
+  void MapTokens(const std::vector& tokens, size_t& row,
+                 arma::Mat& matrix);
+
+  //! Return the type of a given dimension (numeric or categorical).
+  Datatype Type(const size_t dimension) const;
+  //! Modify the type of a dimension; grows the object if dimension is new.
+  Datatype& Type(const size_t dimension);
+
+  /**
+   * Get the number of mappings for a particular dimension. If the dimension
+   * is numeric, then this will return 0.
+   */
+  size_t NumMappings(const size_t dimension) const;
+
+  /**
+   * Get the dimensionality of the DatasetMapper object (that is, how many
+   * dimensions it has information for). If this object was created by a call
+   * to mlpack::data::Load(), then the dimensionality will be the same as the
+   * number of rows (dimensions) in the dataset.
+   */
+  size_t Dimensionality() const;
+
+  /**
+   * Serialize the types and the maps (the policy object is not archived).
+   */
+  template
+  void Serialize(Archive& ar, const unsigned int /* version */)
+  {
+    ar & data::CreateNVP(types, "types");
+    ar & data::CreateNVP(maps, "maps");
+  }
+
+  //! Return the policy of the mapper.
+  const PolicyType& Policy() const;
+
+  //! Modify the policy of the mapper (be careful!).
+  PolicyType& Policy();
+
+  //! Replace the policy of the mapper with a new policy.
+  void Policy(PolicyType&& policy);
+
+ private:
+  //! Types of each dimension.
+  std::vector types;
+
+  // Bidirectional container relating strings and their mapped values.
+  using BiMapType = boost::bimap;
+
+  // Mappings between strings and mapped values.
+  // Map entries will only exist for dimensions that are categorical.
+  // MapType = map from dimension index to pair(BiMapType, numMappings)
+  using MapType = std::unordered_map>;
+
+  //! maps object stores string and numerical pairs.
+  MapType maps;
+
+  //! policy object tells dataset mapper how the categorical values should be
+  // mapped to the maps object. It is used in MapString() and MapTokens().
+  PolicyType policy;
+};
+
+// Alias kept so that existing code written against DatasetInfo still works.
+using DatasetInfo = DatasetMapper;
+
+} // namespace data
+} // namespace mlpack
+
+#include "dataset_mapper_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/dataset_mapper_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp
new file mode 100644
index 00000000000..6b291e2d368
--- /dev/null
+++ b/src/mlpack/core/data/dataset_mapper_impl.hpp
@@ -0,0 +1,151 @@
+/**
+ * @file dataset_mapper_impl.hpp
+ * @author Ryan Curtin
+ * @author Keon Kim
+ *
+ * An implementation of the DatasetMapper class.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "dataset_mapper.hpp"
+
+namespace mlpack {
+namespace data {
+
+// Default constructor: every dimension starts out as numeric.
+template
+inline DatasetMapper::DatasetMapper(const size_t dimensionality) :
+    types(dimensionality, Datatype::numeric)
+{
+  // Nothing to initialize here.
+}
+
+template
+inline DatasetMapper::DatasetMapper(PolicyType& policy,
+                                    const size_t dimensionality) :
+    types(dimensionality, Datatype::numeric),
+    policy(std::move(policy))
+{
+  // Note: the caller's policy (an lvalue reference) was moved from above.
+}
+
+// Mapping a string (and creating a new mapping if needed) is delegated to
+// the policy, which is handed the maps and types members.
+template
+inline typename PolicyType::MappedType DatasetMapper::MapString(
+    const std::string& string,
+    const size_t dimension)
+{
+  return policy.template MapString(string, dimension, maps, types);
+}
+
+// Return the string corresponding to a value in a given dimension.
+template
+inline const std::string& DatasetMapper::UnmapString(
+    const size_t value,
+    const size_t dimension)
+{
+  // Throw an exception if the value doesn't exist.
+  const auto it = maps.find(dimension); // find() never inserts, unlike [].
+  if (it == maps.end() || it->second.first.right.count(value) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetMapper::UnmapString(): value '" << value
+        << "' unknown for dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+  return it->second.first.right.at(value);
+}
+
+// Return the value corresponding to a string in a given dimension.
+template
+inline typename PolicyType::MappedType DatasetMapper::UnmapValue(
+    const std::string& string,
+    const size_t dimension)
+{
+  // Throw if the dimension or the string is unknown; a lookup must not
+  // add an empty mapping for the dimension, which operator[] would do.
+  const auto it = maps.find(dimension);
+  if (it == maps.end() || it->second.first.left.count(string) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetMapper::UnmapValue(): string '" << string
+        << "' unknown for dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+  return it->second.first.left.at(string);
+}
+
+template
+template
+inline void DatasetMapper::MapTokens(
+    const std::vector& tokens,
+    size_t& row,
+    arma::Mat& matrix)
+{
+  return policy.template MapTokens(tokens, row, matrix, maps,
+      types);
+}
+
+// Get the type of a particular dimension.
+template
+inline Datatype DatasetMapper::Type(const size_t dimension) const
+{
+  if (dimension >= types.size())
+  {
+    std::ostringstream oss;
+    oss << "requested type of dimension " << dimension << ", but dataset only "
+        << "has " << types.size() << " dimensions";
+    throw std::invalid_argument(oss.str());
+  }
+
+  return types[dimension];
+}
+
+template
+inline Datatype& DatasetMapper::Type(const size_t dimension)
+{
+  if (dimension >= types.size())
+    types.resize(dimension + 1, Datatype::numeric);
+
+  return types[dimension];
+}
+
+template
+inline
+size_t DatasetMapper::NumMappings(const size_t dimension) const
+{
+  return (maps.count(dimension) == 0) ?
0 : maps.at(dimension).second;
+}
+
+template
+inline size_t DatasetMapper::Dimensionality() const
+{
+  return types.size();
+}
+
+template
+inline const PolicyType& DatasetMapper::Policy() const
+{
+  return this->policy;
+}
+
+template
+inline PolicyType& DatasetMapper::Policy()
+{
+  return this->policy;
+}
+
+template
+inline void DatasetMapper::Policy(PolicyType&& policy)
+{
+  this->policy = std::move(policy); // rvalue reference, not a forwarding one.
+}
+
+
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/imputation_methods/CMakeLists.txt b/src/mlpack/core/data/imputation_methods/CMakeLists.txt
new file mode 100644
index 00000000000..a3993192ec2
--- /dev/null
+++ b/src/mlpack/core/data/imputation_methods/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Define the files we need to compile
+# Anything not in this list will not be compiled into mlpack.
+set(SOURCES
+  custom_imputation.hpp
+  listwise_deletion.hpp
+  mean_imputation.hpp
+  median_imputation.hpp
+)
+
+# Add directory name to sources.
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+# Append sources (with directory name) to list of all mlpack sources (used at
+# the parent scope).
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
new file mode 100644
index 00000000000..c8674a3a1fe
--- /dev/null
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -0,0 +1,77 @@
+/**
+ * @file custom_imputation.hpp
+ * @author Keon Kim
+ *
+ * Definition and implementation of the CustomImputation class.
+ */
+#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
+#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
+
+#include
+
+namespace mlpack {
+namespace data {
+/**
+ * Imputation strategy that replaces missing values with a user-chosen value.
+ * @tparam T Type of the values stored in the matrix.
+ */
+template
+class CustomImputation
+{
+ public:
+  CustomImputation(T customValue):
+      customValue(std::move(customValue))
+  {
+    // nothing to initialize here
+  }
+
+  /**
+   * Search the given dimension of the input for entries equal to mappedValue
+   * (or NaN) and overwrite each of them with the custom value. The input is
+   * modified in place; no copy is made. The custom value is the one that was
+   * passed to the CustomImputation constructor.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor If true, dimension selects a row; otherwise a column.
+   */
+  void Impute(arma::Mat& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    // replace mappedValue and NaN entries with the custom value
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          input(dimension, i) = customValue;
+        }
+      }
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          input(i, dimension) = customValue;
+        }
+      }
+    }
+  }
+
+ private:
+  //! The value that Impute() writes in place of each missing value.
+  T customValue;
+}; // class CustomImputation
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
new file mode 100644
index 00000000000..36eeeeb58a0
--- /dev/null
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -0,0 +1,69 @@
+/**
+ * @file listwise_deletion.hpp
+ * @author Keon Kim
+ *
+ * Definition and implementation of the ListwiseDeletion class.
+ */
+#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
+#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
+
+#include
+
+namespace mlpack {
+namespace data {
+/**
+ * Complete-case analysis: every data point whose entry in the requested
+ * dimension equals mappedValue (or is NaN) is removed from the matrix.
+ * @tparam T Type of the values stored in the matrix.
+ */
+template
+class ListwiseDeletion
+{
+ public:
+  /**
+   * Search the given dimension for mappedValue (or NaN) and remove the whole
+   * column (or row, if !columnMajor) holding it. The input is overwritten.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor If true, dimension selects a row; otherwise a column.
+   */
+  void Impute(arma::Mat& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    // Indices to keep: column indices if columnMajor, row indices otherwise.
+    std::vector colsToKeep;
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (!(input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i))))
+        {
+          colsToKeep.push_back(i);
+        }
+      }
+      input = input.cols(arma::uvec(colsToKeep));
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (!(input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension))))
+        {
+          colsToKeep.push_back(i);
+        }
+      }
+      input = input.rows(arma::uvec(colsToKeep));
+    }
+  }
+}; // class ListwiseDeletion
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
new file mode 100644
index 00000000000..e7a955ea563
--- /dev/null
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -0,0 +1,99 @@
+/**
+ * @file mean_imputation.hpp
+ * @author Keon Kim
+ *
+ * Definition and Implementation of the MeanImputation class.
+ */
+#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
+#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
+
+#include
+
+namespace mlpack {
+namespace data {
+/**
+ * A simple mean imputation class.
+ * @tparam T Type of the values stored in the matrix.
+ */
+template
+class MeanImputation
+{
+ public:
+  /**
+   * Replace every entry of the given dimension that equals mappedValue (or is
+   * NaN) with the mean of the remaining entries. The input is modified in
+   * place; Log::Fatal is used if the dimension has no valid entry at all.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor If true, dimension selects a row; otherwise a column.
+   */
+  void Impute(arma::Mat& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    double sum = 0;
+    size_t elems = 0; // excluding nan or missing target
+
+    using PairType = std::pair;
+    // (row, column) position of every entry that has to be overwritten.
+    std::vector targets;
+
+
+    // Count the valid elements and sum them, skipping mappedValue and NaN.
+    // While doing that, remember where mappedValue or NaN occurs.
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          targets.emplace_back(dimension, i);
+        }
+        else
+        {
+          elems++;
+          sum += input(dimension, i);
+        }
+      }
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          targets.emplace_back(i, dimension);
+        }
+        else
+        {
+          elems++;
+          sum += input(i, dimension);
+        }
+      }
+    }
+
+    if (elems == 0)
+      Log::Fatal << "it is impossible to calculate mean; no valid elements in "
+          << "the dimension" << std::endl;
+
+    // mean of the valid entries; assumes Log::Fatal above does not return.
+    const double mean = sum / elems;
+
+    // Now write the calculated mean into the missing entries.
+    // It only needs to loop through targets vector, not the whole matrix.
+    for (const PairType& target : targets)
+    {
+      input(target.first, target.second) = mean;
+    }
+  }
+}; // class MeanImputation
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
new file mode 100644
index 00000000000..828d22ae6db
--- /dev/null
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -0,0 +1,88 @@
+/**
+ * @file median_imputation.hpp
+ * @author Keon Kim
+ *
+ * Definition and Implementation of the MedianImputation class.
+ */
+#ifndef MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_IMPUTATION_HPP
+#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEDIAN_IMPUTATION_HPP
+
+#include
+
+namespace mlpack {
+namespace data {
+/**
+ * Median imputation strategy: missing values are replaced with the median
+ * (middle value, or average of the two middle values) of the valid entries.
+ * @tparam T Type of the values stored in the matrix.
+ */
+template
+class MedianImputation
+{
+ public:
+  /**
+   * Replace every entry of the given dimension that equals mappedValue (or is
+   * NaN) with the median of the remaining entries, in place. If no valid
+   * entry remains, Log::Fatal is used instead of taking an empty median.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor If true, dimension selects a row; otherwise a column.
+   */
+  void Impute(arma::Mat& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    using PairType = std::pair;
+    // (row, column) position of every entry that has to be overwritten.
+    std::vector targets;
+    // good elements are kept inside this vector.
+    std::vector elemsToKeep;
+
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          targets.emplace_back(dimension, i);
+        }
+        else
+        {
+          elemsToKeep.push_back(input(dimension, i));
+        }
+      }
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          targets.emplace_back(i, dimension);
+        }
+        else
+        {
+          elemsToKeep.push_back(input(i, dimension));
+        }
+      }
+    }
+
+    // arma::median() must not be handed an empty vector (same as the mean).
+    if (elemsToKeep.empty())
+      Log::Fatal << "it is impossible to calculate median; no valid elements "
+          << "in the dimension" << std::endl;
+    const double median = arma::median(arma::vec(elemsToKeep));
+    for (const PairType& target : targets)
+      input(target.first, target.second) = median;
+  }
+}; // class MedianImputation
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
new file mode 100644
index 00000000000..ea1ac68fbf5
--- /dev/null
+++ b/src/mlpack/core/data/imputer.hpp
@@ -0,0 +1,90 @@
+/**
+ * @file imputer.hpp
+ * @author Keon Kim
+ *
+ * Defines the Imputer class, a utility to replace missing variables in a
+ * dataset.
+ */
+#ifndef MLPACK_CORE_DATA_IMPUTER_HPP
+#define MLPACK_CORE_DATA_IMPUTER_HPP
+
+#include
+#include "dataset_mapper.hpp"
+#include "map_policies/missing_policy.hpp"
+#include "map_policies/increment_policy.hpp"
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Given a dataset of a particular datatype, replace user-specified missing
+ * value with a variable dependent on the StrategyType and MapperType.
+ *
+ * @tparam T Type of armadillo matrix used for imputation strategy.
+ * @tparam MapperType DatasetMapper that is used to hold dataset information.
+ * @tparam StrategyType Imputation strategy used.
+ */
+template
+class Imputer
+{
+ public:
+  Imputer(MapperType mapper, bool columnMajor = true):
+      mapper(std::move(mapper)),
+      columnMajor(columnMajor)
+  {
+    // Nothing to initialize here.
+  }
+
+  Imputer(MapperType mapper, StrategyType strategy, bool columnMajor = true):
+      strategy(std::move(strategy)),
+      mapper(std::move(mapper)),
+      columnMajor(columnMajor)
+  {
+    // Nothing to initialize here.
+  }
+
+  /**
+   * Replace the missing values of one dimension of the input in place.
+   * missingValue is first translated with mapper.UnmapValue(); the result is
+   * then passed to strategy.Impute() together with columnMajor.
+   *
+   * @param input Input dataset to apply imputation.
+   * @param missingValue User defined missing value; it can be anything.
+   * @param dimension Dimension to apply the imputation.
+   */
+  void Impute(arma::Mat& input,
+              const std::string& missingValue,
+              const size_t dimension)
+  {
+    T mappedValue = static_cast(mapper.UnmapValue(missingValue, dimension));
+    strategy.Impute(input, mappedValue, dimension, columnMajor);
+  }
+
+  //! Get the strategy
+  const StrategyType& Strategy() const { return strategy; }
+
+  //! Modify the given strategy (be careful!)
+  StrategyType& Strategy() { return strategy; }
+
+  //! Get the mapper
+  const MapperType& Mapper() const { return mapper; }
+
+  //! Modify the given mapper (be careful!)
+  MapperType& Mapper() { return mapper; }
+
+ private:
+  // Imputation strategy that Impute() delegates to.
+  StrategyType strategy;
+
+  // Mapper used to translate missingValue into its mapped value.
+  MapperType mapper;
+
+  // save columnMajor as a member variable since it is rarely changed.
+  bool columnMajor;
+
+}; // class Imputer
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index 19e238a9403..40d3834e3fe 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -14,7 +14,7 @@
 #include
 
 #include "format.hpp"
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
 
 namespace mlpack {
 namespace data /** Functions to load and save matrices and models.
*/ { @@ -61,10 +61,10 @@ bool Load(const std::string& filename, /** * Loads a matrix from a file, guessing the filetype from the extension and - * mapping categorical features with a DatasetInfo object. This will transpose - * the matrix (unless the transpose parameter is set to false). This particular - * overload of Load() can only load text-based formats, such as those given - * below: + * mapping categorical features with a DatasetMapper object. This will + * transpose the matrix (unless the transpose parameter is set to false). + * This particular overload of Load() can only load text-based formats, such as + * those given below: * * - CSV (csv_ascii), denoted by .csv, or optionally .txt * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt @@ -81,20 +81,20 @@ bool Load(const std::string& filename, * mlpack requires column-major matrices, this should be left at its default * value of 'true'. * - * The DatasetInfo object passed to this function will be re-created, so any + * The DatasetMapper object passed to this function will be re-created, so any * mappings from previous loads will be lost. * * @param filename Name of file to load. * @param matrix Matrix to load contents of file into. - * @param info DatasetInfo object to populate with mappings and data types. + * @param info DatasetMapper object to populate with mappings and data types. * @param fatal If an error should be reported as fatal (default false). * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info, + DatasetMapper& info, const bool fatal = false, const bool transpose = true); diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp index f04e38ab8bd..ff6c4315920 100644 --- a/src/mlpack/core/data/load_arff.hpp +++ b/src/mlpack/core/data/load_arff.hpp @@ -42,10 +42,10 @@ void LoadARFF(const std::string& filename, arma::Mat& matrix); * @param info DatasetInfo object; can be default-constructed or pre-existing * from another call to LoadARFF(). */ -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info); + DatasetMapper& info); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 68c9184fe71..71ccea64a86 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -15,10 +15,10 @@ namespace mlpack { namespace data { -template +template void LoadARFF(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info) + DatasetMapper& info) { // First, open the file. std::ifstream ifs; @@ -98,7 +98,7 @@ void LoadARFF(const std::string& filename, // Reset the DatasetInfo object, if needed. 
if (info.Dimensionality() == 0) { - info = DatasetInfo(dimensionality); + info = DatasetMapper(dimensionality); } else if (info.Dimensionality() != dimensionality) { diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 5479bab17d5..45266b52644 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -59,43 +59,6 @@ void TransPoseTokens(std::vector> const &input, } } -template -void MapToNumerical(const std::vector &tokens, - size_t &row, - DatasetInfo &info, - arma::Mat &matrix) -{ - auto notNumber = [](const std::string &str) - { - eT val(0); - std::stringstream token; - token.str(str); - token>>val; - return token.fail(); - }; - - const bool notNumeric = std::any_of(std::begin(tokens), - std::end(tokens), notNumber); - if(notNumeric) - { - for(size_t i = 0; i != tokens.size(); ++i) - { - const eT val = static_cast(info.MapString(tokens[i], row)); - matrix.at(row, i) = val; - } - } - else - { - std::stringstream token; - for(size_t i = 0; i != tokens.size(); ++i) - { - token.str(tokens[i]); - token>>matrix.at(row, i); - token.clear(); - } - } -} - } template @@ -370,10 +333,10 @@ bool Load(const std::string& filename, } // Load with mappings. Unfortunately we have to implement this ourselves. 
-template +template bool Load(const std::string& filename, arma::Mat& matrix, - DatasetInfo& info, + DatasetMapper& info, const bool fatal, const bool transpose) { @@ -446,16 +409,16 @@ bool Load(const std::string& filename, if (transpose) { matrix.set_size(cols, rows); - info = DatasetInfo(cols); + info = DatasetMapper(info.Policy(), cols); } else { matrix.set_size(rows, cols); - info = DatasetInfo(rows); + info = DatasetMapper(info.Policy(), rows); } stream.close(); - stream.open(filename, std::fstream::in); + stream.open(filename, std::fstream::in); if(transpose) { @@ -475,8 +438,7 @@ bool Load(const std::string& filename, for(size_t i = 0; i != cols; ++i) { details::TransPoseTokens(tokensArray, tokens, i); - details::MapToNumerical(tokens, i, - info, matrix); + info.MapTokens(tokens, i, matrix); } } else @@ -487,8 +449,7 @@ bool Load(const std::string& filename, // Extract line by line. std::getline(stream, buffer, '\n'); Tokenizer lineTok(buffer, sep); - details::MapToNumerical(details::ToTokens(lineTok), row, - info, matrix); + info.MapTokens(details::ToTokens(lineTok), row, matrix); ++row; } } diff --git a/src/mlpack/core/data/map_policies/CMakeLists.txt b/src/mlpack/core/data/map_policies/CMakeLists.txt new file mode 100644 index 00000000000..9b40fcc19ce --- /dev/null +++ b/src/mlpack/core/data/map_policies/CMakeLists.txt @@ -0,0 +1,15 @@ +# Define the files we need to compile +# Anything not in this list will not be compiled into mlpack. +set(SOURCES + increment_policy.hpp + missing_policy.hpp +) + +# Add directory name to sources. +set(DIR_SRCS) +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() +# Append sources (with directory name) to list of all mlpack sources (used at +# the parent scope). 
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/src/mlpack/core/data/map_policies/datatype.hpp b/src/mlpack/core/data/map_policies/datatype.hpp new file mode 100644 index 00000000000..3a3b1ac137e --- /dev/null +++ b/src/mlpack/core/data/map_policies/datatype.hpp @@ -0,0 +1,28 @@ +/** + * @file datatype.hpp + * @author Keon Kim + * + */ +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_DATATYPE_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_DATATYPE_HPP + +#include + +namespace mlpack { +namespace data { +/** + * The Datatype enum specifies the types of data mlpack algorithms can use. + * The vast majority of mlpack algorithms can only use numeric data (i.e. + * float/double/etc.), but some algorithms can use categorical data, specified + * via this Datatype enum and the DatasetMapper class. + */ +enum Datatype : bool /* bool is all the precision we need for two types */ +{ + numeric = 0, + categorical = 1 +}; + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp new file mode 100644 index 00000000000..4ff7341a653 --- /dev/null +++ b/src/mlpack/core/data/map_policies/increment_policy.hpp @@ -0,0 +1,131 @@ +/** + * @file increment_policy.hpp + * @author Keon Kim + * + * Default increment mapping policy for dataset info. + */ +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP + +#include +#include +#include +#include + +namespace mlpack { +namespace data { +/** + * IncrementPolicy is used as a helper class for DatasetMapper. It tells how the + * strings should be mapped. Purpose of this policy is to map all dimension if + * one of the variables in a dimension turns out to be a categorical variable. + * IncrementPolicy maps strings to incrementing unsigned integers (size_t). + * The first string to be mapped will be mapped to 0, the next to 1 and so on.
+ */ +class IncrementPolicy +{ + public: + // typedef of MappedType + using MappedType = size_t; + + /** + * Given the string and the dimension to which it belongs, and the maps + * and types given by the DatasetMapper class, returns its numeric mapping. + * If no mapping yet exists, the string is added to the list of mappings for + * the given dimension. This function is used as a helper function for + * DatasetMapper class. + * + * @tparam MapType Type of unordered_map that contains mapped value pairs + * @param string String to find/create mapping for. + * @param dimension Index of the dimension of the string. + * @param maps Unordered map given by the DatasetMapper. + * @param types Vector containing the type information about each dimension. + */ + template + MappedType MapString(const std::string& string, + const size_t dimension, + MapType& maps, + std::vector& types) + { + // If this condition is true, either we have no mapping for the given string + // or we have no mappings for the given dimension at all. In either case, + // we create a mapping. + if (maps.count(dimension) == 0 || + maps[dimension].first.left.count(string) == 0) + { + // This string does not exist yet. + size_t& numMappings = maps[dimension].second; + + // change type of the feature to categorical + if (numMappings == 0) + types[dimension] = Datatype::categorical; + + typedef boost::bimap::value_type PairType; + maps[dimension].first.insert(PairType(string, numMappings)); + return numMappings++; + } + else + { + // This string already exists in the mapping. + return maps[dimension].first.left.at(string); + } + } + + /** + * MapTokens turns vector of strings into numeric variables and puts them + * into a given matrix. It is used as a helper function when trying to load + * files. Each dimension's tokens are given in to this function. If one of the + * tokens turns out to be a string, all the tokens should be mapped using the + * MapString() function.
+ * + * @tparam eT Type of armadillo matrix. + * @tparam MapType Type of unordered_map that contains mapped value pairs. + * @param tokens Vector of variables inside a dimension. + * @param row Position of the given tokens. + * @param matrix Matrix to save the data into. + * @param maps Maps given by the DatasetMapper class. + * @param types Types of each dimensions given by the DatasetMapper class. + */ + template + void MapTokens(const std::vector& tokens, + size_t& row, + arma::Mat& matrix, + MapType& maps, + std::vector& types) + { + auto notNumber = [](const std::string& str) + { + eT val(0); + std::stringstream token; + token.str(str); + token >> val; + return token.fail(); + }; + + const bool notNumeric = std::any_of(std::begin(tokens), + std::end(tokens), notNumber); + if (notNumeric) + { + for (size_t i = 0; i != tokens.size(); ++i) + { + const eT val = static_cast(this->MapString(tokens[i], row, maps, + types)); + matrix.at(row, i) = val; + } + } + else + { + std::stringstream token; + for (size_t i = 0; i != tokens.size(); ++i) + { + token.str(tokens[i]); + token >> matrix.at(row, i); + token.clear(); + } + } + } +}; // class IncrementPolicy + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp new file mode 100644 index 00000000000..ff60a5a0ce5 --- /dev/null +++ b/src/mlpack/core/data/map_policies/missing_policy.hpp @@ -0,0 +1,151 @@ +/** + * @file missing_policy.hpp + * @author Keon Kim + * + * Missing map policy for dataset info. + */ +#ifndef MLPACK_CORE_DATA_MAP_POLICIES_MISSING_POLICY_HPP +#define MLPACK_CORE_DATA_MAP_POLICIES_MISSING_POLICY_HPP + +#include +#include +#include +#include +#include + +namespace mlpack { +namespace data { +/** + * MissingPolicy is used as a helper class for DatasetMapper. It tells how the + * strings should be mapped. 
Purpose of this policy is to map all user-defined + * missing variables into maps so that users can decide what to do with the + * corrupted data. User-defined missing variables are given by the missingSet. + * Note that MissingPolicy does not change type of features. + */ +class MissingPolicy +{ + public: + // typedef of MappedType + using MappedType = double; + + MissingPolicy() + { + // Nothing to initialize here. + } + + /** + * Create the MissingPolicy object with the given missingSet. Note that the + * missingSet cannot be changed later; you will have to create a new + * MissingPolicy object. + * + * @param missingSet Set of strings that should be mapped. + */ + explicit MissingPolicy(std::set missingSet) : + missingSet(std::move(missingSet)) + { + // Nothing to initialize here. + } + + /** + * Given the string and the dimension to which it belongs by the user, and + * the maps and types given by the DatasetMapper class, returns its numeric + * mapping. If no mapping yet exists and the string is included in the + * missingSet, the string is added to the list of mappings for the given + * dimension. This function is used as a helper function for DatasetMapper + * class. + * + * @tparam MapType Type of unordered_map that contains mapped value pairs + * @param string String to find/create mapping for. + * @param dimension Index of the dimension of the string. + * @param maps Unordered map given by the DatasetMapper. + * @param types Vector containing the type information about each dimensions. + */ + template + MappedType MapString(const std::string& string, + const size_t dimension, + MapType& maps, + std::vector& types) + { + // mute the unused parameter warning (does nothing here.) + (void)types; + // If this condition is true, either we have no mapping for the given string + // or we have no mappings for the given dimension at all. In either case, + // we create a mapping. 
+ const double NaN = std::numeric_limits::quiet_NaN(); + if (missingSet.count(string) != 0 && + (maps.count(dimension) == 0 || + maps[dimension].first.left.count(string) == 0)) + { + // This string does not exist yet. + typedef boost::bimap::value_type PairType; + maps[dimension].first.insert(PairType(string, NaN)); + + size_t& numMappings = maps[dimension].second; + ++numMappings; + return NaN; + } + else + { + // This string already exists in the mapping + // or is not included in missingSet. + // Unlike IncrementPolicy, MissingPolicy counts all mapped values. + size_t& numMappings = maps[dimension].second; + ++numMappings; + return NaN; + } + } + + /** + * MapTokens turns vector of strings into numeric variables and puts them + * into a given matrix. It is used as a helper function when trying to load + * files. Each dimension's tokens are given in to this function. If one of the + * tokens turns out to be a string or one of the missingSet's variables, only + * the token responsible for it should be mapped using the MapString() + * function. + * + * @tparam eT Type of armadillo matrix. + * @tparam MapType Type of unordered_map that contains mapped value pairs. + * @param tokens Vector of variables inside a dimension. + * @param row Position of the given tokens. + * @param matrix Matrix to save the data into. + * @param maps Maps given by the DatasetMapper class. + * @param types Types of each dimension given by the DatasetMapper class. + */ + template + void MapTokens(const std::vector& tokens, + size_t& row, + arma::Mat& matrix, + MapType& maps, + std::vector& types) + { + // MissingPolicy allows double type matrix only, because it uses NaN. + static_assert(std::is_same::value, "You must use double type " + " matrix in order to apply MissingPolicy"); + + std::stringstream token; + for (size_t i = 0; i != tokens.size(); ++i) + { + token.str(tokens[i]); + token>>matrix.at(row, i); + // if the token is not a number, map it.
+ // or if token is a number, but is included in the missingSet, map it. + if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet)) + { + const eT val = static_cast(this->MapString(tokens[i], row, maps, + types)); + matrix.at(row, i) = val; + } + token.clear(); + } + } + + private: + // Note that missingSet and maps are different. + // missingSet specifies which value/string should be mapped. + std::set missingSet; +}; // class MissingPolicy + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/methods/preprocess/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt index b10c8eacb20..0eee8acd9d6 100644 --- a/src/mlpack/methods/preprocess/CMakeLists.txt +++ b/src/mlpack/methods/preprocess/CMakeLists.txt @@ -16,4 +16,4 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE) add_cli_executable(preprocess_split) add_cli_executable(preprocess_binarize) #add_cli_executable(preprocess_scan) -#add_cli_executable(preprocess_imputer) +add_cli_executable(preprocess_imputer) diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp new file mode 100644 index 00000000000..2863b3e65a3 --- /dev/null +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -0,0 +1,174 @@ +/** + * @file preprocess_imputer_main.cpp + * @author Keon Kim + * + * A utility that provides imputation strategies for + * missing values. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user " + "defined missing variable to another to provide more meaningful analysis " + "\n\n" + "The program does not modify the original file, but instead makes a " + "separate file to save the output data; You can save the output by " + "specifying the file name with --output_file (-o)."
+ "\n\n" + "For example, if we consider 'NULL' in dimension 0 to be a missing " + "variable and want to delete whole row containing the NULL in the " + "column-wise dataset, and save the result to result.csv, we could run" + "\n\n" + "$ mlpack_preprocess_imputer -i dataset.csv -o result.csv -m NULL -d 0 \n" + "> -s listwise_deletion"); + +PARAM_STRING_IN_REQ("input_file", "File containing data,", "i"); +PARAM_STRING_OUT("output_file", "File to save output", "o"); +PARAM_STRING_IN("missing_value", "User defined missing value", "m", ""); +PARAM_STRING_IN("strategy", "imputation strategy to be applied", "s", ""); +PARAM_DOUBLE_IN("custom_value", "user_defined custom value", "c", 0.0); +PARAM_INT_IN("dimension", "the dimension to apply imputation", "d", 0); + +using namespace mlpack; +using namespace arma; +using namespace std; +using namespace data; + +int main(int argc, char** argv) +{ + CLI::ParseCommandLine(argc, argv); + + const string inputFile = CLI::GetParam("input_file"); + const string outputFile = CLI::GetParam("output_file"); + const string missingValue = CLI::GetParam("missing_value"); + const double customValue = CLI::GetParam("custom_value"); + const size_t dimension = (size_t) CLI::GetParam("dimension"); + string strategy = CLI::GetParam("strategy"); + + // The program needs user-defined missing values. + // Missing values can be any list of strings such as "1", "a", "NULL". + if (!CLI::HasParam("missing_value")) + Log::Fatal << "--missing_value must be specified in order to perform " + << "any imputation strategies." << endl; + + if (!CLI::HasParam("strategy")) + Log::Fatal << "--strategy must be specified in order to perform " + << "imputation."<< endl; + + if (!CLI::HasParam("output_file")) + Log::Warn << "--output_file is not specified, no " + << "results from this program will be saved!" 
<< endl; + + if (!CLI::HasParam("dimension")) + Log::Warn << "--dimension is not specified, the imputation will be " + << "applied to all dimensions."<< endl; + + // If custom value is specified, and imputation strategy is not, + // set imputation strategy to "custom" + if (CLI::HasParam("custom_value") && !CLI::HasParam("strategy")) + { + strategy = "custom"; + Log::Warn << "--custom_value is specified without --strategy, " + << "--strategy is automatically set to 'custom'." << endl; + } + + // Custom value and any other impute strategies cannot be specified at + // the same time. + if (CLI::HasParam("custom_value") && CLI::HasParam("strategy") && + strategy != "custom") + Log::Fatal << "--custom_value cannot be specified with " + << "impute strategies excluding 'custom' strategy" << endl; + + // custom_value must be specified when using "custom" imputation strategy + if ((strategy == "custom") && !CLI::HasParam("custom_value")) + Log::Fatal << "--custom_value must be specified when using " + << "'custom' strategy" << endl; + + arma::mat input; + arma::mat output; + // Policy tells how the DatasetMapper should map the values. + std::set missingSet; + missingSet.insert(missingValue); + MissingPolicy policy(missingSet); + using MapperType = DatasetMapper; + DatasetMapper info(policy); + std::vector dirtyDimensions; + + Load(inputFile, input, info, true, true); + + // print how many mapping exist in each dimensions + for (size_t i = 0; i < input.n_rows; ++i) + { + size_t numMappings = info.NumMappings(i); + Log::Info << numMappings << " mappings in dimension " << i << "." 
+ << endl; + if (numMappings > 0) + { + dirtyDimensions.push_back(i); + } + } + + // Initialize imputer class + Imputer> imputer(info); + if (strategy == "mean") + { + Imputer> imputer(info); + } + else if (strategy == "median") + { + Imputer> imputer(info); + } + else if (strategy == "listwise_deletion") + { + Imputer> imputer(info); + } + else if (strategy == "custom") + { + CustomImputation strat(customValue); + Imputer> imputer(info, strat); + } + else + { + Log::Fatal << "'" << strategy << "' imputation strategy does not exist" + << endl; + } + + Timer::Start("imputation"); + if (CLI::HasParam("dimension")) + { + // when --dimension is specified, + // the program will apply the changes to only the given dimension. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on dimension " << dimension + << "." << endl; + + imputer.Impute(input, missingValue, dimension); + } + else + { + // when --dimension is not specified, + // the program will apply the changes to all dimensions. + Log::Info << "Performing '" << strategy << "' imputation strategy " + << "to replace '" << missingValue << "' on all dimensions." << endl; + + for (size_t i : dirtyDimensions) + { + imputer.Impute(input, missingValue, i); + } + } + Timer::Stop("imputation"); + + if (!outputFile.empty()) + { + Log::Info << "Saving results to '" << outputFile << "'." 
<< endl; + Save(outputFile, input, false); + } +} + diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index b13494dff09..7db9026eb02 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -22,6 +22,7 @@ add_executable(mlpack_test gmm_test.cpp hmm_test.cpp hoeffding_tree_test.cpp + imputation_test.cpp ind2sub_test.cpp init_rules_test.cpp kernel_test.cpp diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp new file mode 100644 index 00000000000..ce48ad0bddb --- /dev/null +++ b/src/mlpack/tests/imputation_test.cpp @@ -0,0 +1,266 @@ +/** + * @file imputation_test.cpp + * @author Keon Kim + * + * Tests for data::Imputer class + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "test_tools.hpp" + +using namespace mlpack; +using namespace mlpack::data; +using namespace std; + +BOOST_AUTO_TEST_SUITE(ImputationTest); +/** + * 1. Make sure a CSV is loaded correctly with mappings using MissingPolicy. + * 2. Try Imputer object with CustomImputation method to impute data "a". + * (It is ok to test on one method since the other ones will be covered in the + * next cases). 
+ */ +BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest) +{ + fstream f; + f.open("test_file.csv", fstream::out); + f << "a, 2, 3" << endl; + f << "5, 6, a" << endl; + f << "8, 9, 10" << endl; + f.close(); + + arma::mat input; + + std::set mset; + mset.insert("a"); + MissingPolicy policy(mset); + DatasetMapper info(policy); + BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true); + + // row and column test + BOOST_REQUIRE_EQUAL(input.n_rows, 3); + BOOST_REQUIRE_EQUAL(input.n_cols, 3); + + // Load check + // MissingPolicy should convert strings to nans + BOOST_REQUIRE(std::isnan(input(0, 0)) == true); + BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 3.0, 1e-5); + BOOST_REQUIRE(std::isnan(input(2, 1)) == true); + BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); + + // convert missing vals to 99. + CustomImputation customStrategy(99); + Imputer, + CustomImputation> imputer(info, customStrategy); + // convert a or nan to 99 for dimension 0 + imputer.Impute(input, "a", 0); + + // Custom imputation result check + BOOST_REQUIRE_CLOSE(input(0, 0), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 1), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(0, 2), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 0), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(1, 2), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(input(2, 0), 3.0, 1e-5); + BOOST_REQUIRE(std::isnan(input(2, 1)) == true); // remains as NaN + BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5); + + // Remove the file. + remove("test_file.csv"); +} + +/** + * Make sure CustomImputation method replaces data 0 to 99. 
+ */ +BOOST_AUTO_TEST_CASE(CustomImputationTest) +{ + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat rowWiseInput(columnWiseInput); + double customValue = 99; + double mappedValue = 0.0; + + CustomImputation imputer(customValue); + + // column wise + imputer.Impute(columnWiseInput, mappedValue, 0/*dimension*/, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); + + // row wise + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 99.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 3), 8.0, 1e-5); +} + +/** + * Make sure MeanImputation method replaces data 0 to mean value of each + * dimensions. 
+ */ +BOOST_AUTO_TEST_CASE(MeanImputationTest) +{ + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat rowWiseInput(columnWiseInput); + double mappedValue = 0.0; + + MeanImputation imputer; + + // column wise + imputer.Impute(columnWiseInput, mappedValue, 0, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 2.5, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); + + // row wise + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 3), 8.0, 1e-5); +} + +/** + * Make sure MedianImputation method replaces data 0 to median value of each + * dimension.
+ */ +BOOST_AUTO_TEST_CASE(MedianImputationTest) +{ + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat rowWiseInput(columnWiseInput); + double mappedValue = 0.0; + + MedianImputation imputer; + + // column wise + imputer.Impute(columnWiseInput, mappedValue, 1, true); + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 2), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 3), 8.0, 1e-5); + + // row wise + imputer.Impute(rowWiseInput, mappedValue, 1, false); + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 7.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(2, 2), 4.0, 1e-5); +} + +/** + * Make sure ListwiseDeletion method deletes the whole column (if column wise) + * or the row (if row wise) containing value of 0. 
+ */ +BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) +{ + arma::mat columnWiseInput("3.0 0.0 2.0 0.0;" + "5.0 6.0 0.0 6.0;" + "9.0 8.0 4.0 8.0;"); + arma::mat rowWiseInput(columnWiseInput); + double mappedValue = 0.0; + + ListwiseDeletion imputer; + + // column wise + imputer.Impute(columnWiseInput, mappedValue, 0, true); // column wise + + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 0), 3.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(0, 1), 2.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(1, 1), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(columnWiseInput(2, 1), 4.0, 1e-5); + + // row wise + imputer.Impute(rowWiseInput, mappedValue, 1, false); // row wise + + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 0), 5.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 1), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 2), 0.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(0, 3), 6.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 0), 9.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 1), 8.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 2), 4.0, 1e-5); + BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 8.0, 1e-5); +} + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 4eb8f12598e..27930edbef3 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -1401,6 +1401,8 @@ BOOST_AUTO_TEST_CASE(HarderKeonTest) BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(1), 5); BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(2), 5); BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(3), 3); + + remove("test.csv"); } /**