
[GSOC]DatasetMapper & Imputer #694

Merged
47 commits, merged Jul 25, 2016.

Commits:
87c05a5
concept work for imputer
keon Jun 1, 2016
2e4b1a8
Merge branch 'master' of github.com:keonkim/mlpack into imputer
keon Jun 6, 2016
631e59e
do not to use NaN by default, let the user specify
keon Jun 6, 2016
391006e
Merge branch 'master' of github.com:keonkim/mlpack into imputer
keon Jun 6, 2016
6a1fb81
add template to datasetinfo and add imputer class
keon Jun 12, 2016
b0c5224
clean datasetinfo class and rename files
keon Jun 13, 2016
de35241
implement basic imputation strategies
keon Jun 13, 2016
2d38604
modify imputer_main and clean logs
keon Jun 13, 2016
bb045b8
add parameter verification for imputer_main
keon Jun 13, 2016
1295f4b
add custom strategy to impute_main
keon Jun 13, 2016
5a517c2
add datatype change in IncrementPolicy
keon Jun 14, 2016
94b7a5c
update types used in datasetinfo
keon Jun 14, 2016
ebed68f
initialize imputer with parameters
keon Jun 14, 2016
db78f39
remove datatype in dataset_info
keon Jun 15, 2016
7c60b97
Merge branch 'master' of github.com:keonkim/mlpack into imputer
keon Jun 15, 2016
da4e409
add test for imputer
keon Jun 15, 2016
d8618ec
restructure, add listwise deletion & imputer tests
keon Jun 18, 2016
3b8ffd0
fix transpose problem
keon Jun 27, 2016
90a5cd2
Merge pull request #7 from mlpack/master
keon Jun 27, 2016
32c8a73
merge
keon Jun 27, 2016
e09d9bc
updates and fixes on imputation methods
keon Jun 28, 2016
87d8d46
update data::load to accept different mappertypes
keon Jul 1, 2016
de0b2db
update data::load to accept different policies
keon Jul 1, 2016
bc187ca
add imputer doc
keon Jul 1, 2016
a340f69
debug median imputation and listwise deletion
keon Jul 2, 2016
21d94c0
remove duplicate code in load function
keon Jul 2, 2016
a92afaa
delete load overload
keon Jul 3, 2016
bace8b2
modify MapToNumerical to work with MissingPolicy
keon Jul 4, 2016
896a018
MissingPolicy uses NaN instead of numbers
keon Jul 4, 2016
1a908c2
fix reference issue in DatasetMapper
keon Jul 4, 2016
2edbc40
Move MapToNumerical(MapTokens) to Policy class
keon Jul 5, 2016
d881cb7
make policy and imputation api more consistent
keon Jul 5, 2016
a881831
numerical values can be set as missing values
keon Jul 6, 2016
63268a3
add comments and use more proper names
keon Jul 7, 2016
2eb6754
modify custom impute interface and rename variables
keon Jul 10, 2016
6d43aa3
add input-only overloads to imputation methods
keon Jul 10, 2016
fedc5e0
update median imputation to exclude missing values
keon Jul 11, 2016
787fd82
optimize imputation methods with output overloads
keon Jul 18, 2016
a0b7d59
expressive comments in imputation_test
keon Jul 18, 2016
9a6dce7
shorten imputation tests
keon Jul 18, 2016
c3aeba1
optimize preprocess imputer executable
keon Jul 18, 2016
028c217
fix bugs in imputation test
keon Jul 18, 2016
03e19a4
add more comments and delete impute_test.csv
keon Jul 22, 2016
ef4536b
Merge pull request #8 from mlpack/master
keon Jul 22, 2016
6e2c1ff
Merge branch 'master' of github.com:keonkim/mlpack into imputer
keon Jul 22, 2016
5eb9abd
fix PARAM statements in imputer
keon Jul 22, 2016
d043235
delete Impute() overloads that produce output matrix
keon Jul 23, 2016
10 changes: 10 additions & 0 deletions src/mlpack/core/data/dataset_info.hpp
@@ -37,7 +37,13 @@ class DatasetMapper
*/
explicit DatasetMapper(const size_t dimensionality = 0);

/**
* Create the DatasetMapper object with the given policy and dimensionality.
* Note that the dimensionality cannot be changed later; you will have to
* create a new DatasetMapper object. Policy can be modified by the modifier.
*/
explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);

/**
* Given the string and the dimension to which it belongs, return its numeric
* mapping. If no mapping yet exists, the string is added to the list of
@@ -101,8 +107,12 @@ class DatasetMapper
ar & data::CreateNVP(maps, "maps");
}

//! Return the policy of the mapper.
PolicyType& Policy() const;
Contributor:

I think the return type should be `PolicyType const&`, or else the compiler may not be able to compile it (this makes sense: a const member function returning a non-const reference to a data member is weird).

Try to compile the following code and you will find it does not compile:

struct testClass
{
    int& getValue() const
    {
        return a; // error: cannot bind int& to 'a', which is const inside a const member function
    }

    int a;
};
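The diff below already follows the conventional fix: a const accessor for reading paired with a non-const overload for modification. A minimal compilable sketch of that pattern — the class name is hypothetical, and the const overload returns `const&` as the reviewer suggests:

```cpp
#include <cassert>

// Const/non-const accessor pair: the const overload returns a const
// reference (read-only access), the non-const overload allows modification.
// Hypothetical class, sketching the pattern discussed in the review.
class Holder
{
 public:
  const int& Value() const { return value; }  // compiles: const& from a const member
  int& Value() { return value; }              // modifiable access
 private:
  int value = 0;
};
```

Calling `Value()` on a const object selects the first overload; on a non-const object, the second.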


//! Modify the policy of the mapper (be careful!).
PolicyType& Policy();

private:
//! Types of each dimension.
std::vector<Datatype> types;
7 changes: 7 additions & 0 deletions src/mlpack/core/data/dataset_info_impl.hpp
@@ -121,6 +121,13 @@ inline PolicyType& DatasetMapper<PolicyType>::Policy() const
return this->policy;
}

template<typename PolicyType>
inline PolicyType& DatasetMapper<PolicyType>::Policy()
{
return this->policy;
}


} // namespace data
} // namespace mlpack

46 changes: 0 additions & 46 deletions src/mlpack/core/data/load.hpp
@@ -96,52 +96,6 @@ bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
DatasetMapper<PolicyType>& info,
const bool fatal = false,
const bool transpose = true)
{
PolicyType policy;
return Load(filename, matrix, info, policy, fatal, transpose);
}

/**
* Loads a matrix from a file, guessing the filetype from the extension and
* mapping categorical features with a DatasetMapper object. This will
* transpose the matrix (unless the transpose parameter is set to false).
* This particular overload of Load() can only load text-based formats, such as
* those given below:
*
* - CSV (csv_ascii), denoted by .csv, or optionally .txt
* - TSV (raw_ascii), denoted by .tsv, .csv, or .txt
* - ASCII (raw_ascii), denoted by .txt
*
* If the file extension is not one of those types, an error will be given.
* This is preferable to Armadillo's default behavior of loading an unknown
* filetype as raw_binary, which can have very confusing effects.
*
* If the parameter 'fatal' is set to true, a std::runtime_error exception will
* be thrown if the matrix does not load successfully. The parameter
* 'transpose' controls whether or not the matrix is transposed after loading.
* In most cases, because data is generally stored in a row-major format and
* mlpack requires column-major matrices, this should be left at its default
* value of 'true'.
*
* The DatasetMapper object passed to this function will be re-created, so any
* mappings from previous loads will be lost. policy is passed to the
* constructor of DatasetMapper to create a new instance.
*
* @param filename Name of file to load.
* @param matrix Matrix to load contents of file into.
* @param info DatasetMapper object to populate with mappings and data types.
* @param policy Policy class that decides how the DatasetMapper should map.
* @param fatal If an error should be reported as fatal (default false).
* @param transpose If true, transpose the matrix after loading.
* @return Boolean value indicating success or failure of load.
*/
template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
DatasetMapper<PolicyType>& info,
PolicyType& policy,
const bool fatal = false,
const bool transpose = true);

/**
15 changes: 6 additions & 9 deletions src/mlpack/core/data/load_impl.hpp
@@ -369,18 +369,17 @@ bool Load(const std::string& filename,
return success;
}

// Load with mappings and policy.
// Load with mappings. Unfortunately we have to implement this ourselves.
template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
DatasetMapper<PolicyType>& info,
PolicyType& policy,
const bool fatal,
const bool transpose)
{
// Get the extension and load as necessary.
Timer::Start("loading_data");
Log::Debug << "Load with Policy" << std::endl;

// Get the extension.
std::string extension = Extension(filename);

@@ -412,7 +411,7 @@ bool Load(const std::string& filename,
type = "raw ASCII-formatted data";

Log::Info << "Loading '" << filename << "' as " << type << ". "
<< std::flush;
<< std::endl;
std::string separators;
if (commas)
separators = ",";
@@ -447,14 +446,12 @@ bool Load(const std::string& filename,
if (transpose)
{
matrix.set_size(cols, rows);
Log::Debug << "initialize datasetmapper with policy" << std::endl;
info = DatasetMapper<PolicyType>(policy, cols);
info = DatasetMapper<PolicyType>(info.Policy(), cols);
Member:

I think it's possible the problem I am thinking of existed in earlier versions of mlpack, but what happens if I do this:

arma::mat trainingSet, testSet;
DatasetInfo d;
data::Load("training_set.csv", trainingSet, d);
data::Load("test_set.csv", testSet, d);

Ideally the mappings from the first load should be preserved and used for the second load, but the lines here make me think that is not what is happening. I think probably we should add a test for this situation, to ensure that mapping information from a previous load is not destroyed.
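The concern above — that mappings from the first load should survive the second — can be sketched with a toy mapper (illustrative only, not mlpack's DatasetMapper): MapString() reuses an existing id when the string has been seen before, so a second load extends the table instead of rebuilding it.

```cpp
#include <cassert>
#include <cstddef>
#include <map>
#include <string>

// Toy string->id mapper; names are hypothetical, not mlpack's API.
struct TinyMapper
{
  std::map<std::string, std::size_t> ids;

  std::size_t MapString(const std::string& s)
  {
    auto it = ids.find(s);
    if (it != ids.end())
      return it->second;                // mapping from an earlier load is reused
    const std::size_t id = ids.size();  // otherwise assign the next free id
    ids[s] = id;
    return id;
  }
};
```

A regression test for the scenario in the comment would map a category during the first "load", map more strings afterward, and assert the first category still maps to the same id.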

}
else
{
matrix.set_size(rows, cols);
Log::Debug << "initialize datasetmapper with policy" << std::endl;
info = DatasetMapper<PolicyType>(policy, rows);
info = DatasetMapper<PolicyType>(info.Policy(), rows);
}

stream.close();
@@ -499,7 +496,7 @@ bool Load(const std::string& filename,
else if (extension == "arff")
{
Log::Info << "Loading '" << filename << "' as ARFF dataset. "
<< std::flush;
<< std::endl;
try
{
LoadARFF(filename, matrix, info);
3 changes: 2 additions & 1 deletion src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -24,7 +24,8 @@ namespace data {
class IncrementPolicy
{
public:
typedef size_t mapped_type;
// typedef of mapped_type
using mapped_type = size_t;

template <typename MapType>
mapped_type MapString(MapType& maps,
9 changes: 6 additions & 3 deletions src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -24,7 +24,8 @@ namespace data {
class MissingPolicy
{
public:
typedef size_t mapped_type;
// typedef of mapped_type
using mapped_type = size_t;

MissingPolicy()
Contributor:

This looks quite weird — it seems my suggestion confused you, sorry about that. Could you show me some examples explaining what effect you want to achieve with MissingPolicy? Thanks.

Member:

It would be great if you could add some comments about what the function does and the parameters it uses.

Member:

It would be great if you could use Doxygen commands like @param for the parameters.

Member Author:

Updated

{
@@ -48,9 +49,10 @@ class MissingPolicy
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
Log::Debug << "missingSet has: " << missingSet.count(string) << std::endl;
if (missingSet.count(string) != 0 &&
Member:

We could avoid this check if you added everything from missingSet to maps[dimension] in the constructor. That might give some noticeable speedup (I am not sure about that).

Contributor:

The roles of missingSet and maps are different:
missingSet specifies which values/strings we should map;
maps records the values/strings we have already mapped.

Member:

I see what you mean. My thinking was, everything in missingSet is something we might expect to be in maps by the end of loading. So you can save a little bit of time by just putting everything in missingSet into maps pre-emptively. The code as it is written here only adds things into maps from missingSet when it is seen, but I am not sure it is important to have a different, standalone missingSet. What do you think?

Contributor:

The problem is that maps has a separate mapping per dimension to handle, while missingSet applies to all of the dimensions. Studying the Fast C++ CSV Parser, the code made me really appreciate the expressive power and performance Spirit gives us.

Member:

Ah, I didn't think of that. I agree now — putting everything from missingSet into maps is not the right thing.

I agree, too, Boost Spirit seems really cool; I need to play with the code in your PR to see if we can preserve decent compile times.
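Conceptually, the policy under discussion maps any token listed in missingSet to NaN and parses everything else numerically (per the "MissingPolicy uses NaN instead of numbers" commit). A stdlib-only sketch under that assumption — the function name and signature are illustrative, not the real MapString() interface:

```cpp
#include <cmath>
#include <limits>
#include <set>
#include <string>

// Tokens found in missingSet become NaN; every other token is parsed as
// a number. Simplified stand-in for the policy, not mlpack's actual API.
double MapToken(const std::set<std::string>& missingSet,
                const std::string& token)
{
  if (missingSet.count(token) != 0)
    return std::numeric_limits<double>::quiet_NaN();
  return std::stod(token);
}
```

Because every missing token collapses to the same NaN sentinel, only membership in missingSet matters — which is why the per-dimension maps and the global missingSet stay separate.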

maps.count(dimension) == 0 ||
maps[dimension].first.left.count(string) == 0)
(maps.count(dimension) == 0 ||
maps[dimension].first.left.count(string) == 0))
{
// This string does not exist yet.
size_t& numMappings = maps[dimension].second;
@@ -62,6 +64,7 @@ class MissingPolicy
else
{
// This string already exists in the mapping.
Log::Debug << "string already exists in the mapping" << std::endl;
return maps[dimension].first.left.at(string);
}
}
2 changes: 1 addition & 1 deletion src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -98,7 +98,7 @@ int main(int argc, char** argv)
Log::Debug << "initalize info(policy)" << endl;
DatasetMapper<MissingPolicy> info(policy);

Load<double, MissingPolicy>(inputFile, input, info, policy, true, true);
Load(inputFile, input, info, true, true);

// for testing purpose
Log::Info << input << endl;
28 changes: 20 additions & 8 deletions src/mlpack/tests/imputation_test.cpp
@@ -39,22 +39,34 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)

arma::mat input;
arma::mat output;
string missingValue = "a";
double customValue = 99;
size_t feature = 0;
size_t dimension = 0;

DatasetInfo info;
std::set<string> mset;
mset.insert("a");
MissingPolicy miss(mset);
DatasetMapper<MissingPolicy> info(miss);
BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);

BOOST_REQUIRE_EQUAL(input.n_rows, 3);
BOOST_REQUIRE_EQUAL(input.n_cols, 3);

/* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/

//Imputer<double,
//DatasetInfo,
//CustomImputation<double>> impu(info);
//impu.Impute(input, output, missingValue, customValue, feature);
Imputer<double,
DatasetMapper<MissingPolicy>,
CustomImputation<double>> imputer(info);
imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99

BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 2), 7.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 0), 8.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 1), 9.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);

// Remove the file.
Contributor:

Hasn't this been completed?

Member Author:

Yeah, this is the part where I wanted to test the whole process of loading, mapping, and imputation.
I will at least make some mock data and test only the mapping and imputation for this.

remove("test_file.csv");
}
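The custom imputation exercised by this test ("a" mapped to a missing value, then replaced with 99) can be sketched without Armadillo: replace every missing entry (NaN here) in the chosen dimension with the user-supplied value. Function and parameter names are illustrative, not mlpack's Imputer API.

```cpp
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Replace NaN entries in one dimension (one row of the column-major data)
// with a custom value. Illustrative sketch of the custom-imputation
// strategy, not mlpack code.
void CustomImpute(std::vector<std::vector<double>>& data,
                  const std::size_t dimension,
                  const double customValue)
{
  for (double& v : data[dimension])
    if (std::isnan(v))
      v = customValue;
}
```

Entries in other dimensions are left untouched, matching the per-dimension behavior the test asserts above.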