From e049bb7e45be7cdef524a2f066ab78d630e868df Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 4 Jun 2016 19:25:20 +0800 Subject: [PATCH 01/40] add overload, able to move string --- src/mlpack/core/data/dataset_info.hpp | 16 +++++++++++++--- src/mlpack/core/data/dataset_info_impl.hpp | 7 ++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 29c7cee8be4..7406b459c70 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -53,8 +53,16 @@ class DatasetInfo * * @param string String to find/create mapping for. * @param dimension Index of the dimension of the string. - */ - size_t MapString(const std::string& string, const size_t dimension); + */ + size_t MapString(const std::string &string, const size_t dimension) + { + return MapString(string, dimension); + } + + size_t MapString(std::string &&string, const size_t dimension) + { + return MapString(std::move(string), dimension); + } /** * Return the string that corresponds to a given value in a given dimension. @@ -95,7 +103,7 @@ class DatasetInfo ar & data::CreateNVP(maps, "maps"); } - private: + private: //! Types of each dimension. std::vector types; @@ -104,6 +112,8 @@ class DatasetInfo std::unordered_map, size_t>> maps; + template + size_t MapString(T&& string, const size_t dimension); }; } // namespace data diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index a3ee24dc576..ed20f06c401 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -21,7 +21,8 @@ inline DatasetInfo::DatasetInfo(const size_t dimensionality) : } // Map the string to a numeric id. -inline size_t DatasetInfo::MapString(const std::string& string, +template +inline size_t DatasetInfo::MapString(T&& string, const size_t dimension) { // If this condition is true, either we have no mapping for the given string @@ -35,13 +36,13 @@ inline size_t DatasetInfo::MapString(const std::string& string, if (numMappings == 0) types[dimension] = Datatype::categorical; typedef boost::bimap::value_type PairType; - maps[dimension].first.insert(PairType(string, numMappings)); + maps[dimension].first.insert(PairType(std::forward(string), numMappings)); return numMappings++; } else { // This string already exists in the mapping. - return maps[dimension].first.left.at(string); + return maps[dimension].first.left.at(std::forward(string)); } } From c26052b449ba1062aba686b4da796b5a961fc639 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 4 Jun 2016 20:34:31 +0800 Subject: [PATCH 02/40] fix bug--infinite recursive call --- src/mlpack/core/data/dataset_info.hpp | 6 +++--- src/mlpack/core/data/dataset_info_impl.hpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp index 7406b459c70..29663f1260a 100644 --- a/src/mlpack/core/data/dataset_info.hpp +++ b/src/mlpack/core/data/dataset_info.hpp @@ -56,12 +56,12 @@ class DatasetInfo */ size_t MapString(const std::string &string, const size_t dimension) { - return MapString(string, dimension); + return MapStringImpl(string, dimension); } size_t MapString(std::string &&string, const size_t dimension) { - return MapString(std::move(string), dimension); + return MapStringImpl(std::move(string), dimension); } /** @@ -113,7 +113,7 @@ class DatasetInfo size_t>> maps; template - size_t MapString(T&& string, const size_t dimension); + size_t MapStringImpl(T&& string, const size_t dimension); }; } // namespace data diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp index ed20f06c401..dbcd57f83c8 100644 --- a/src/mlpack/core/data/dataset_info_impl.hpp +++ b/src/mlpack/core/data/dataset_info_impl.hpp @@ -22,8 +22,8 @@ inline DatasetInfo::DatasetInfo(const size_t dimensionality) : // Map the string to a numeric id. template -inline size_t DatasetInfo::MapString(T&& string, - const size_t dimension) +inline size_t DatasetInfo::MapStringImpl(T&& string, + const size_t dimension) { // If this condition is true, either we have no mapping for the given string // or we have no mappings for the given dimension at all. In either case, From cd7c895be3bf42145e482450a156b04dff79e6ce Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 4 Jun 2016 21:48:55 +0800 Subject: [PATCH 03/40] first commit --- src/mlpack/core/data/load_csv.hpp | 281 ++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 src/mlpack/core/data/load_csv.hpp diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp new file mode 100644 index 00000000000..aab567f59c6 --- /dev/null +++ b/src/mlpack/core/data/load_csv.hpp @@ -0,0 +1,281 @@ +/** + * @file load_csv.hpp + * @author ThamNgapWei + * + * This is a csv parsers which use to parse the csv file format + */ +#ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP +#define MLPACK_CORE_DATA_LOAD_CSV_HPP + +#include + +#include +#include // Includes Armadillo. +#include + +#include "format.hpp" +#include "dataset_info.hpp" + +namespace mlpack { +namespace data /** Functions to load and save matrices and models. */ { + +namespace details /** Implementation details, please do not use them in production codes */{ + +//put the implementation details do not depend on template params +//out of template class, this could reduce duplicate binary codes +//if the compiler/linker not smart enough + +} + +/** + *Load the csv file.This class use boost::spirit + *to implement the parser, please refer to following link + *http://theboostcpplibraries.com/boost.spirit for quick review. + */ +class LoadCSV +{ +public: + explicit LoadCSV(std::string const &file) : inFile(file) + { + if(!inFile.is_open()) + { + throw std::runtime_error("LoadCSV can not open file"); + } + inFile.unsetf(std::ios::skipws); + } + + template + void Load(arma::Mat &inout, DatasetInfo &infoSet, bool transpose = true) + { + //please refer to the comments of ColSize if you do not familiar + //with boost::spirit yet + if(transpose) + { + TranposeParse(inout, infoSet); + } + else + { + NonTranposeParse(inout, infoSet); + } + } + + size_t ColSize() + { + //boost tokenizer or strtok can do the same thing, I use + //spirit at here because I think this is a nice example + using namespace boost::spirit; + using bsi_type = boost::spirit::istream_iterator; + using iter_type = boost::iterator_range; + + inFile.clear(); + inFile.seekg(0, std::ios::beg); + //spirit::qi requires iterators to be atleast forward iterators, + //but std::istream_iterator is input iteraotr, so we use + //boost::spirit::istream_iterator to overcome this problem + bsi_type begin(inFile); + bsi_type end; + size_t col = 0; + + //the parser of boost spirit can work with "actions"(functor) + //when the parser find match target, this functor will be executed + auto findColSize = [&col](iter_type){ ++col; }; + + //qi::char_ bite an character + //qi::char_(",\r\n") only bite a "," or "\r" or "\n" character + //* means the parser(ex : qi::char_) can bite [0, any size] of characters + //~ means negate, so ~qi::char_(",\r\n") means I want to bite anything except of ",\r\n" + //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) + + //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser + //will try to convert the string to std::string, this would cause memory allocation + //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will + //become boost::iterator_range, this could save a tons of memory allocations + qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); + + return col; + } + + size_t RowSize() + { + inFile.clear(); + inFile.seekg(0, std::ios::beg); + size_t row = 0; + std::string line; + while(std::getline(inFile, line)) + { + ++row; + } + + return row; + } + +private: + using iter_type = boost::iterator_range; + + struct ElemParser + { + //return int_parser if the type of T is_integral + template + static typename std::enable_if::value, + boost::spirit::qi::int_parser>::type + Parser() + { + return boost::spirit::qi::int_parser(); + } + + //return real_parser if T is floating_point + template + static typename std::enable_if::value, + boost::spirit::qi::real_parser>::type + Parser() + { + return boost::spirit::qi::real_parser(); + } + }; + + template + void NonTranposeParse(arma::Mat &inout, DatasetInfo &infoSet) + { + using namespace boost::spirit; + + size_t row = 0; + size_t col = 0; + infoSet = DatasetInfo(RowSize()); + std::string line; + inout.set_size(infoSet.Dimensionality(), ColSize()); + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + auto setNum = [&](T val) + { + inout(row, col++) = val; + }; + auto setCharClass = [&](iter_type const &iter) + { + inout(row, col++) = + static_cast(infoSet.MapString(std::string(iter.begin(), iter.end()), + row)); + }; + + qi::rule numRule = CreateNumRule(); + qi::rule charRule = CreateCharRule(); + while(std::getline(inFile, line)) + { + auto begin = line.begin(); + const bool allNumber = + qi::parse(begin, line.end(), numRule[setNum] % ","); + if(!allNumber) + { + begin = line.begin(); + col = 0; + const bool canParse = qi::parse(begin, line.end(), + charRule[setCharClass] % ","); + if(!canParse) + { + throw std::runtime_error("LoadCSV cannot parse categories"); + break; + } + } + ++row; col = 0; + } + } + + template + void TranposeParse(arma::Mat &inout, DatasetInfo &infoSet) + { + infoSet = DatasetInfo(ColSize()); + inout.set_size(infoSet.Dimensionality(), RowSize()); + while(!TranposeParseImpl(inout, infoSet)) + { + + } + } + + template + bool TranposeParseImpl(arma::Mat &inout, DatasetInfo &infoSet) + { + using namespace boost::spirit; + + size_t row = 0; + size_t col = 0; + std::string line; + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + auto setNum = [&](T val) + { + inout(row++, col) = val; + }; + auto setCharClass = [&](iter_type const &iter) + { + inout(row++, col) = + static_cast(infoSet.MapString(std::string(iter.begin(), iter.end()), + col)); + }; + + qi::rule numRule = CreateNumRule(); + qi::rule charRule = CreateCharRule(); + while(std::getline(inFile, line)) + { + auto begin = line.begin(); + const bool allNumber = + qi::parse(begin, line.end(), numRule[setNum] % ","); + if(!allNumber) + { + begin = line.begin(); + const size_t dimension = infoSet.NumMappings(col); + if((dimension == 0 && row == 0) || dimension != 0) + { + row = 0; + const bool canParse = qi::parse(begin, line.end(), + charRule[setCharClass] % ","); + if(!canParse) + { + throw std::runtime_error("LoadCSV cannot parse categories"); + } + } + else + { + return false; + } + } + row = 0; ++col; + } + + return true; + } + + template + boost::spirit::qi::rule CreateNumRule() const + { + using namespace boost::spirit; + + //elemParser will generate integer or real parser based on T + auto elemParser = ElemParser::Parser(); + //qi::skip can specify which characters you want to skip, + //in this example, elemParser will parse int or double value, + //but we do not want space to intefere it, so we skip it by qi::skip + + //qi::omit can omit the attributes of spirit, every parser of spirit + //has attribute(the type will pass into actions(functor)) + //if you do not omit it, the attribute combine with attribute may + //change the attribute + + //"-" means one or zero(same as "-" of EBNF) + return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")]; + } + + boost::spirit::qi::rule CreateCharRule() const + { + using namespace boost::spirit; + return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")] + >> -qi::omit[*qi::char_(" ")]; + } + + std::ifstream inFile; +}; + +} // namespace data +} // namespace mlpack + +#endif From 97713bd123200ae2a5ad89c9b8ded35da45f82f1 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 00:22:27 +0800 Subject: [PATCH 04/40] 1 : fix bug, did not consider case like "210DM, 1~200" 2 : fix bug, cannot parse transpose file with correct result --- src/mlpack/core/data/load_csv.hpp | 71 ++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index aab567f59c6..89dd19c9239 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -11,6 +11,8 @@ #include #include // Includes Armadillo. + +#include #include #include "format.hpp" @@ -164,7 +166,10 @@ class LoadCSV auto begin = line.begin(); const bool allNumber = qi::parse(begin, line.end(), numRule[setNum] % ","); - if(!allNumber) + //input like 2-200 or 2DM will make the parser fail, + //so we have to make sure col == inout.n_cols, else parse + //the input line again + if(!allNumber || col != inout.n_cols) { begin = line.begin(); col = 0; @@ -185,19 +190,29 @@ class LoadCSV { infoSet = DatasetInfo(ColSize()); inout.set_size(infoSet.Dimensionality(), RowSize()); - while(!TranposeParseImpl(inout, infoSet)) + size_t parseTime = 0; + std::unordered_set mapRows; + while(!TranposeParseImpl(inout, infoSet, mapRows)) { - + //avoid infinite loop + ++parseTime; + infoSet = DatasetInfo(inout.n_rows); + if(parseTime == inout.n_rows) + { + return; + } } } template - bool TranposeParseImpl(arma::Mat &inout, DatasetInfo &infoSet) + bool TranposeParseImpl(arma::Mat &inout, DatasetInfo &infoSet, + std::unordered_set &mapRows) { using namespace boost::spirit; size_t row = 0; size_t col = 0; + size_t progress = 0; std::string line; inFile.clear(); inFile.seekg(0, std::ios::beg); @@ -205,12 +220,15 @@ class LoadCSV auto setNum = [&](T val) { inout(row++, col) = val; + ++progress; + //std::cout<(infoSet.MapString(std::string(iter.begin(), iter.end()), - col)); + progress++)); }; qi::rule numRule = CreateNumRule(); @@ -218,28 +236,39 @@ class LoadCSV while(std::getline(inFile, line)) { auto begin = line.begin(); - const bool allNumber = - qi::parse(begin, line.end(), numRule[setNum] % ","); - if(!allNumber) + const bool shouldMapNum = mapRows.find(row) != std::end(mapRows); + bool allNumber = false; + if(!shouldMapNum) { - begin = line.begin(); - const size_t dimension = infoSet.NumMappings(col); - if((dimension == 0 && row == 0) || dimension != 0) + allNumber = qi::parse(begin, line.end(), numRule[setNum] % ","); + } + //std::cout<<"progress "< Date: Sun, 5 Jun 2016 02:18:12 +0800 Subject: [PATCH 05/40] fix bug--category conversion should based on columns but not rows --- src/mlpack/core/data/load_csv.hpp | 85 ++++++++++++++++--------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 89dd19c9239..ce5c8c4b02f 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -7,7 +7,7 @@ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP -#include +#include #include #include // Includes Armadillo. @@ -191,8 +191,8 @@ class LoadCSV infoSet = DatasetInfo(ColSize()); inout.set_size(infoSet.Dimensionality(), RowSize()); size_t parseTime = 0; - std::unordered_set mapRows; - while(!TranposeParseImpl(inout, infoSet, mapRows)) + std::set mapCols; + while(!TranposeParseImpl(inout, infoSet, mapCols)) { //avoid infinite loop ++parseTime; @@ -206,29 +206,50 @@ class LoadCSV template bool TranposeParseImpl(arma::Mat &inout, DatasetInfo &infoSet, - std::unordered_set &mapRows) + std::set &mapCols) { using namespace boost::spirit; + //static size_t loop = 0; + //std::cout<<"loop "<(infoSet.MapString(std::to_string(val), + progress)); + } + else + { + inout(row, col) = val; + } + ++progress; ++row; }; auto setCharClass = [&](iter_type const &iter) { - //std::cout<(infoSet.MapString(std::string(iter.begin(), iter.end()), - progress++)); + if(mapCols.find(progress) != std::end(mapCols)) + { + //std::cout<<"nstr("<(infoSet.MapString(std::string(iter.begin(), iter.end()), + progress)); + } + else + { + //std::cout<<"str("< numRule = CreateNumRule(); @@ -236,39 +257,21 @@ class LoadCSV while(std::getline(inFile, line)) { auto begin = line.begin(); - const bool shouldMapNum = mapRows.find(row) != std::end(mapRows); - bool allNumber = false; - if(!shouldMapNum) + row = 0; + progress = 0; + const size_t oldSize = mapCols.size(); + const bool canParse = qi::parse(begin, line.end(), + (numRule[setNum] | charRule[setCharClass]) % ","); + //std::cout< oldSize) { - //std::cout<<"not all number"< Date: Sun, 5 Jun 2016 10:25:56 +0800 Subject: [PATCH 06/40] remove useless codes --- src/mlpack/core/data/load_csv.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index ce5c8c4b02f..91a81ab6d9e 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -21,14 +21,6 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { -namespace details /** Implementation details, please do not use them in production codes */{ - -//put the implementation details do not depend on template params -//out of template class, this could reduce duplicate binary codes -//if the compiler/linker not smart enough - -} - /** *Load the csv file.This class use boost::spirit *to implement the parser, please refer to following link From 1c13764ac76524b43b51e91882af4d85f79ec6bf Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 10:52:24 +0800 Subject: [PATCH 07/40] support tsv --- src/mlpack/core/data/load_csv.hpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 91a81ab6d9e..5f4bad25499 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -29,7 +29,9 @@ namespace data /** Functions to load and save matrices and models. */ { class LoadCSV { public: - explicit LoadCSV(std::string const &file) : inFile(file) + explicit LoadCSV(const std::string &file) : + extension(Extension(file)), + inFile(file) { if(!inFile.is_open()) { @@ -292,10 +294,20 @@ class LoadCSV boost::spirit::qi::rule CreateCharRule() const { using namespace boost::spirit; - return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")] - >> -qi::omit[*qi::char_(" ")]; + + if(extension == "csv") + { + return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")] + >> -qi::omit[*qi::char_(" ")]; + } + else + { + return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" \t\r\n")] + >> -qi::omit[*qi::char_(" ")]; + } } + std::string extension; std::ifstream inFile; }; From 34ed51dc55cc020edb57c24543ce4a7e9308cc08 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 10:57:39 +0800 Subject: [PATCH 08/40] use LoadCSV to implement csv/tsv/txt loader --- src/mlpack/core/data/load_impl.hpp | 181 +---------------------------- 1 file changed, 4 insertions(+), 177 deletions(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 5479bab17d5..8654732f7aa 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -10,6 +10,7 @@ // In case it hasn't already been included. #include "load.hpp" #include "extension.hpp" +#include "load_csv.hpp" #include #include @@ -29,75 +30,6 @@ namespace mlpack { namespace data { -namespace details{ - -template -std::vector ToTokens(Tokenizer &lineTok) -{ - std::vector tokens; - std::transform(std::begin(lineTok), std::end(lineTok), - std::back_inserter(tokens), - [&tokens](std::string const &str) - { - std::string trimmedToken(str); - boost::trim(trimmedToken); - return std::move(trimmedToken); - }); - - return tokens; -} - -inline -void TransPoseTokens(std::vector> const &input, - std::vector &output, - size_t index) -{ - output.clear(); - for(size_t i = 0; i != input.size(); ++i) - { - output.emplace_back(input[i][index]); - } -} - -template -void MapToNumerical(const std::vector &tokens, - size_t &row, - DatasetInfo &info, - arma::Mat &matrix) -{ - auto notNumber = [](const std::string &str) - { - eT val(0); - std::stringstream token; - token.str(str); - token>>val; - return token.fail(); - }; - - const bool notNumeric = std::any_of(std::begin(tokens), - std::end(tokens), notNumber); - if(notNumeric) - { - for(size_t i = 0; i != tokens.size(); ++i) - { - const eT val = static_cast(info.MapString(tokens[i], row)); - matrix.at(row, i) = val; - } - } - else - { - std::stringstream token; - for(size_t i = 0; i != tokens.size(); ++i) - { - token.str(tokens[i]); - token>>matrix.at(row, i); - token.clear(); - } - } -} - -} - template bool inline inplace_transpose(arma::Mat& X) { @@ -381,117 +313,12 @@ bool Load(const std::string& filename, Timer::Start("loading_data"); // Get the extension. - std::string extension = Extension(filename); - - // Catch nonexistent files by opening the stream ourselves. - std::fstream stream; - stream.open(filename.c_str(), std::fstream::in); - - if (!stream.is_open()) - { - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; - else - Log::Warn << "Cannot open file '" << filename << "'; load failed." - << std::endl; - - return false; - } + const std::string extension = Extension(filename); if (extension == "csv" || extension == "tsv" || extension == "txt") { - // True if we're looking for commas; if false, we're looking for spaces. - bool commas = (extension == "csv"); - - std::string type; - if (extension == "csv") - type = "CSV data"; - else - type = "raw ASCII-formatted data"; - - Log::Info << "Loading '" << filename << "' as " << type << ". " - << std::flush; - std::string separators; - if (commas) - separators = ","; - else - separators = " \t"; - - // We'll load this as CSV (or CSV with spaces or tabs) according to - // RFC4180. So the first thing to do is determine the size of the matrix. - std::string buffer; - size_t cols = 0; - - std::getline(stream, buffer, '\n'); - // Count commas and whitespace in the line, ignoring anything inside - // quotes. - typedef boost::tokenizer> Tokenizer; - boost::escaped_list_separator sep("\\", separators, "\""); - Tokenizer tok(buffer, sep); - for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) - ++cols; - - // Now count the number of lines in the file. We've already counted the - // first one. - size_t rows = 1; - while (!stream.eof() && !stream.bad() && !stream.fail()) - { - std::getline(stream, buffer, '\n'); - if (!stream.fail()) - ++rows; - } - - // Now we have the size. So resize our matrix. - if (transpose) - { - matrix.set_size(cols, rows); - info = DatasetInfo(cols); - } - else - { - matrix.set_size(rows, cols); - info = DatasetInfo(rows); - } - - stream.close(); - stream.open(filename, std::fstream::in); - - if(transpose) - { - std::vector> tokensArray; - std::vector tokens; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - tokens = details::ToTokens(lineTok); - if(tokens.size() == cols) - { - tokensArray.emplace_back(std::move(tokens)); - } - } - for(size_t i = 0; i != cols; ++i) - { - details::TransPoseTokens(tokensArray, tokens, i); - details::MapToNumerical(tokens, i, - info, matrix); - } - } - else - { - size_t row = 0; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - details::MapToNumerical(details::ToTokens(lineTok), row, - info, matrix); - ++row; - } - } + LoadCSV loader(filename); + loader.Load(matrix, info, transpose); } else if (extension == "arff") { From fe1feb88a6bb0b293aa6534fdd476361b78d5a5d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 11:00:33 +0800 Subject: [PATCH 09/40] fix bug--LoadCSV should parse txt parse file too --- src/mlpack/core/data/load_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 5f4bad25499..499c616acc6 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -295,7 +295,7 @@ class LoadCSV { using namespace boost::spirit; - if(extension == "csv") + if(extension == "csv" || extension == "txt") { return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")] >> -qi::omit[*qi::char_(" ")]; From 18650988c982e9e2b0f6dd2dfe0ece926af281c5 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 13:13:25 +0800 Subject: [PATCH 10/40] can specify fatal or not if file cannot open --- src/mlpack/core/data/load_csv.hpp | 37 +++++++++++++++++++++++++----- src/mlpack/core/data/load_impl.hpp | 2 +- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 499c616acc6..34438b71d2f 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -29,20 +29,23 @@ namespace data /** Functions to load and save matrices and models. */ { class LoadCSV { public: - explicit LoadCSV(const std::string &file) : + explicit LoadCSV(std::string file, bool fatal = false) : extension(Extension(file)), + fatalIfOpenFail(fatal), + fileName(std::move(file)), inFile(file) { - if(!inFile.is_open()) - { - throw std::runtime_error("LoadCSV can not open file"); - } - inFile.unsetf(std::ios::skipws); + CanOpen(); } template void Load(arma::Mat &inout, DatasetInfo &infoSet, bool transpose = true) { + if(!CanOpen()) + { + return; + } + //please refer to the comments of ColSize if you do not familiar //with boost::spirit yet if(transpose) @@ -129,6 +132,26 @@ class LoadCSV } }; + bool CanOpen() + { + if(!inFile.is_open()) + { + if(fatalIfOpenFail) + { + Log::Fatal << "Cannot open file '" << fileName << "'. " << std::endl; + } + else + { + Log::Warn << "Cannot open file '" << fileName << "'; load failed." + << std::endl; + } + return false; + } + inFile.unsetf(std::ios::skipws); + + return true; + } + template void NonTranposeParse(arma::Mat &inout, DatasetInfo &infoSet) { @@ -308,6 +331,8 @@ class LoadCSV } std::string extension; + bool fatalIfOpenFail; + std::string fileName; std::ifstream inFile; }; diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 8654732f7aa..eaddb95b6df 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -317,7 +317,7 @@ bool Load(const std::string& filename, if (extension == "csv" || extension == "tsv" || extension == "txt") { - LoadCSV loader(filename); + LoadCSV loader(filename, fatal); loader.Load(matrix, info, transpose); } else if (extension == "arff") From 1afb484a5b42aaf8387013b161ca4380b2b6c73a Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 13:20:50 +0800 Subject: [PATCH 11/40] fix bug--should not use empty string to open file --- src/mlpack/core/data/load_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 34438b71d2f..44ffa580703 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -33,7 +33,7 @@ class LoadCSV extension(Extension(file)), fatalIfOpenFail(fatal), fileName(std::move(file)), - inFile(file) + inFile(fileName) { CanOpen(); } From e8b216c9becf3fd8d48934cbed8ff0d308115acc Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 13:43:47 +0800 Subject: [PATCH 12/40] treat \t and space as same category --- src/mlpack/core/data/load_csv.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 44ffa580703..79be6589594 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -171,8 +171,13 @@ class LoadCSV }; auto setCharClass = [&](iter_type const &iter) { + std::string str = std::string(iter.begin(), iter.end()); + if(str == "\t") + { + str.clear(); + } inout(row, col++) = - static_cast(infoSet.MapString(std::string(iter.begin(), iter.end()), + static_cast(infoSet.MapString(std::move(str), row)); }; @@ -256,8 +261,13 @@ class LoadCSV if(mapCols.find(progress) != std::end(mapCols)) { //std::cout<<"nstr("<(infoSet.MapString(std::string(iter.begin(), iter.end()), + static_cast(infoSet.MapString(std::move(str), progress)); } else From bc25da5af651af29356c85648e285ffe47894bd1 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 5 Jun 2016 14:37:17 +0800 Subject: [PATCH 13/40] refine string construct --- src/mlpack/core/data/load_csv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 79be6589594..05d8b5e09c4 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -171,7 +171,7 @@ class LoadCSV }; auto setCharClass = [&](iter_type const &iter) { - std::string str = std::string(iter.begin(), iter.end()); + std::string str(iter.begin(), iter.end()); if(str == "\t") { str.clear(); @@ -261,7 +261,7 @@ class LoadCSV if(mapCols.find(progress) != std::end(mapCols)) { //std::cout<<"nstr("< Date: Mon, 6 Jun 2016 12:21:31 +0800 Subject: [PATCH 14/40] refine comments --- src/mlpack/core/data/load_csv.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 05d8b5e09c4..2804662b742 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -86,7 +86,8 @@ class LoadCSV //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser - //will try to convert the string to std::string, this would cause memory allocation + //will try to convert the string to std::string, this may cause memory allocation(if small string + //optimization fail). //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will //become boost::iterator_range, this could save a tons of memory allocations qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); From 12fcc4706aed663cb742f1b5ddb5c61fd5711d6c Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Mon, 6 Jun 2016 12:33:09 +0800 Subject: [PATCH 15/40] refine comments --- src/mlpack/core/data/load_csv.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 2804662b742..7a4a0a71459 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -187,6 +187,8 @@ class LoadCSV while(std::getline(inFile, line)) { auto begin = line.begin(); + //parse the numbers from a line(ex : 1,2,3,4), if the parser find the number + //it will execute the setNum function const bool allNumber = qi::parse(begin, line.end(), numRule[setNum] % ","); //input like 2-200 or 2DM will make the parser fail, @@ -288,6 +290,8 @@ class LoadCSV row = 0; progress = 0; const size_t oldSize = mapCols.size(); + //parse number of characters from a line, it will execute setNum if it is number, + //else execute setCharClass, "|" means "if not a, then b" const bool canParse = qi::parse(begin, line.end(), (numRule[setNum] | charRule[setCharClass]) % ","); //std::cout< Date: Mon, 6 Jun 2016 14:45:38 +0800 Subject: [PATCH 16/40] refine comments and parsers --- src/mlpack/core/data/load_csv.hpp | 33 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 7a4a0a71459..968d3ce9562 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -187,14 +187,10 @@ class LoadCSV while(std::getline(inFile, line)) { auto begin = line.begin(); - //parse the numbers from a line(ex : 1,2,3,4), if the parser find the number - //it will execute the setNum function - const bool allNumber = - qi::parse(begin, line.end(), numRule[setNum] % ","); - //input like 2-200 or 2DM will make the parser fail, - //so we have to make sure col == inout.n_cols, else parse - //the input line again - if(!allNumber || col != inout.n_cols) + //parse the numbers from a line(ex : 1,2,3,4), if the parser find the number + //it will execute the setNum function + qi::parse(begin, line.end(), numRule[setNum] % ","); + if(col != inout.n_cols) { begin = line.begin(); col = 0; @@ -293,7 +289,7 @@ class LoadCSV //parse number of characters from a line, it will execute setNum if it is number, //else execute setCharClass, "|" means "if not a, then b" const bool canParse = qi::parse(begin, line.end(), - (numRule[setNum] | charRule[setCharClass]) % ","); + (numRule[setNum] | (charRule)[setCharClass]) % ","); //std::cout<> -qi::omit[*qi::char_(" ")]; + if(extension == "csv" || extension == "txt") + { + return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")] + >> &(qi::lit(",") | qi::eol | qi::eoi); + } + else + { + return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")] + >> &(qi::lit("\t") | qi::eol | qi::eoi); + } } boost::spirit::qi::rule CreateCharRule() const From 88d81ddb74155a835664a3eb1c3204b50f23a501 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Mon, 6 Jun 2016 20:19:58 +0800 Subject: [PATCH 17/40] add new test cases for strings like "200-DM" --- src/mlpack/tests/load_save_test.cpp | 89 +++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 3917aead1c7..8c7022a5697 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -1134,6 +1134,50 @@ BOOST_AUTO_TEST_CASE(CategoricalCSVLoadTest03) remove("test.csv"); } +BOOST_AUTO_TEST_CASE(CategoricalCSVLoadTest04) +{ + fstream f; + f.open("test.csv", fstream::out); + f << "200-DM, 1, 1" << endl; + f << "1, 1, 1" << endl; + f << "1, 1, 1" << endl; + f << "1, 1, 1" << endl; + f.close(); + + // Load the test CSV. + arma::umat matrix; + DatasetInfo info; + data::Load("test.csv", matrix, info, true); + + BOOST_REQUIRE_EQUAL(matrix.n_cols, 4); + BOOST_REQUIRE_EQUAL(matrix.n_rows, 3); + + BOOST_REQUIRE_EQUAL(matrix(0, 0), 0); + BOOST_REQUIRE_EQUAL(matrix(0, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(0, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(0, 3), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 0), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 3), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 0), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 3), 1); + + BOOST_REQUIRE(info.Type(0) == Datatype::categorical); + BOOST_REQUIRE(info.Type(1) == Datatype::numeric); + BOOST_REQUIRE(info.Type(2) == Datatype::numeric); + + BOOST_REQUIRE_EQUAL(info.MapString("200-DM", 0), 0); + BOOST_REQUIRE_EQUAL(info.MapString("1", 0), 1); + + BOOST_REQUIRE_EQUAL(info.UnmapString(0, 0), "200-DM"); + BOOST_REQUIRE_EQUAL(info.UnmapString(1, 0), "1"); + + remove("test.csv"); +} + BOOST_AUTO_TEST_CASE(CategoricalNontransposedCSVLoadTest00) { fstream f; @@ -1361,6 +1405,51 @@ BOOST_AUTO_TEST_CASE(CategoricalNontransposedCSVLoadTest03) remove("test.csv"); } +BOOST_AUTO_TEST_CASE(CategoricalNontransposedCSVLoadTest04) +{ + fstream f; + f.open("test.csv", fstream::out); + f << " 200-DM , 1 , 1 " << endl; + f << " 1 , 1 , 1 " << endl; + f << " 1 , 1 , 1 " << endl; + f << " 1 , 1 , 1 " << endl; + f.close(); + + // Load the test CSV. + arma::umat matrix; + DatasetInfo info; + data::Load("test.csv", matrix, info, true, false); // No transpose. + + BOOST_REQUIRE_EQUAL(matrix.n_cols, 3); + BOOST_REQUIRE_EQUAL(matrix.n_rows, 4); + + BOOST_REQUIRE_EQUAL(matrix(0, 0), 0); + BOOST_REQUIRE_EQUAL(matrix(0, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(0, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 0), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(1, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 0), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(2, 2), 1); + BOOST_REQUIRE_EQUAL(matrix(3, 0), 1); + BOOST_REQUIRE_EQUAL(matrix(3, 1), 1); + BOOST_REQUIRE_EQUAL(matrix(3, 2), 1); + + BOOST_REQUIRE(info.Type(0) == Datatype::categorical); + BOOST_REQUIRE(info.Type(1) == Datatype::numeric); + BOOST_REQUIRE(info.Type(2) == Datatype::numeric); + BOOST_REQUIRE(info.Type(3) == Datatype::numeric); + + BOOST_REQUIRE_EQUAL(info.MapString("200-DM", 1), 0); + BOOST_REQUIRE_EQUAL(info.MapString("1", 1), 1); + + BOOST_REQUIRE_EQUAL(info.UnmapString(0, 1), "200-DM"); + BOOST_REQUIRE_EQUAL(info.UnmapString(1, 1), "1"); + + remove("test.csv"); +} + /** * A harder test CSV based on the concerns in #658. */ From 0b20e9ac6ce4936f79308718473e6ffd1e20df59 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Tue, 7 Jun 2016 13:44:07 +0800 Subject: [PATCH 18/40] simplify parser by phrase_parse --- src/mlpack/core/data/load_csv.hpp | 60 +++++++++++++------------------ 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 968d3ce9562..0a23a97efe2 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -27,7 +27,7 @@ namespace data /** Functions to load and save matrices and models. */ { *http://theboostcpplibraries.com/boost.spirit for quick review. */ class LoadCSV -{ +{ public: explicit LoadCSV(std::string file, bool fatal = false) : extension(Extension(file)), @@ -86,8 +86,8 @@ class LoadCSV //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser - //will try to convert the string to std::string, this may cause memory allocation(if small string - //optimization fail). + //will try to convert the string to std::string, this may cause memory allocation(if small string + //optimization fail). //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will //become boost::iterator_range, this could save a tons of memory allocations qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); @@ -182,20 +182,21 @@ class LoadCSV row)); }; - qi::rule numRule = CreateNumRule(); - qi::rule charRule = CreateCharRule(); + auto numRule = CreateNumRule(); + auto charRule = CreateCharRule(); while(std::getline(inFile, line)) { auto begin = line.begin(); //parse the numbers from a line(ex : 1,2,3,4), if the parser find the number //it will execute the setNum function - qi::parse(begin, line.end(), numRule[setNum] % ","); + qi::phrase_parse(begin, line.end(), numRule[setNum] % ",", ascii::space); if(col != inout.n_cols) { begin = line.begin(); col = 0; - const bool canParse = qi::parse(begin, line.end(), - charRule[setCharClass] % ","); + const bool canParse = qi::phrase_parse(begin, line.end(), + charRule[setCharClass] % ",", + ascii::space); if(!canParse) { throw std::runtime_error("LoadCSV cannot parse categories"); @@ -231,9 +232,6 @@ class LoadCSV { using namespace boost::spirit; - //static size_t loop = 0; - //std::cout<<"loop "< numRule = CreateNumRule(); - qi::rule charRule = CreateCharRule(); + auto numRule = CreateNumRule(); + auto charRule = CreateCharRule(); while(std::getline(inFile, line)) { auto begin = line.begin(); row = 0; progress = 0; const size_t oldSize = mapCols.size(); - //parse number of characters from a line, it will execute setNum if it is number, - //else execute setCharClass, "|" means "if not a, then b" - const bool canParse = qi::parse(begin, line.end(), - (numRule[setNum] | (charRule)[setCharClass]) % ","); - //std::cout< - boost::spirit::qi::rule CreateNumRule() const + boost::spirit::qi::rule + CreateNumRule() const { using namespace boost::spirit; @@ -319,40 +314,35 @@ class LoadCSV //qi::omit can omit the attributes of spirit, every parser of spirit //has attribute(the type will pass into actions(functor)) //if you do not omit it, the attribute combine with attribute may - //change the attribute + //change the attribute //input like 2-200 or 2DM will make the parser fail, //so we use "look ahead parser--&" to make sure next //character is "," or end of line(eof) or end of file(eoi) //looks ahead parser will not consume any input or generate //any attribute - - //"-" means one or zero(same as "-" of EBNF) if(extension == "csv" || extension == "txt") { - return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")] - >> &(qi::lit(",") | qi::eol | qi::eoi); + return elemParser >> &(qi::lit(",") | qi::eol | qi::eoi); } else { - return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")] - >> &(qi::lit("\t") | qi::eol | qi::eoi); + return elemParser >> &(qi::lit("\t") | qi::eol | qi::eoi); } } - boost::spirit::qi::rule CreateCharRule() const + boost::spirit::qi::rule + CreateCharRule() const { using namespace boost::spirit; if(extension == "csv" || extension == "txt") { - return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")] - >> -qi::omit[*qi::char_(" ")]; + return qi::raw[*~qi::char_(" ,\r\n")]; } else { - return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" \t\r\n")] - >> -qi::omit[*qi::char_(" ")]; + return qi::raw[*~qi::char_(" \t\r\n")]; } } From 0a04ec482ffda99a37d303c23fd01d98e61f8bbd Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Tue, 7 Jun 2016 13:51:24 +0800 Subject: [PATCH 19/40] simnplify parser and refine format --- src/mlpack/core/data/load_csv.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 0a23a97efe2..c80fee010f0 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -87,7 +87,7 @@ class LoadCSV //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser //will try to convert the string to std::string, this may cause memory allocation(if small string - //optimization fail). + //optimization fail). //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will //become boost::iterator_range, this could save a tons of memory allocations qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); @@ -338,11 +338,11 @@ class LoadCSV if(extension == "csv" || extension == "txt") { - return qi::raw[*~qi::char_(" ,\r\n")]; + return qi::raw[*~qi::char_(",\r\n")]; } else { - return qi::raw[*~qi::char_(" \t\r\n")]; + return qi::raw[*~qi::char_("\t\r\n")]; } } From 8866942379c4a24e92b61fc79eced93602b30b94 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 12:47:29 +0800 Subject: [PATCH 20/40] add forward declaration for DatasetInfo, wihtout it the vc2015 compiler cannot find the declaration, weird --- src/mlpack/core/data/load.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 19e238a9403..c8decf326aa 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -19,6 +19,8 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { +class DatasetInfo; + /** * Loads a matrix from file, guessing the filetype from the extension. This * will transpose the matrix at load time (unless the transpose parameter is set From 0ceba31326b338c5b55af3c3a353e58006fcf68d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 12:48:35 +0800 Subject: [PATCH 21/40] add load_csv.hpp and load_csv.cpp --- src/mlpack/core/data/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index ea87d0f13ab..524057d902f 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -6,6 +6,8 @@ set(SOURCES extension.hpp format.hpp load.hpp + load_csv.hpp + load_csv.cpp load_impl.hpp load_arff.hpp load_arff_impl.hpp From 7cc597f14b52698586bc6b40bc4682669107d074 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 12:49:04 +0800 Subject: [PATCH 22/40] split part of the implementation details into cpp, this may reduce some compile times --- src/mlpack/core/data/load_csv.cpp | 102 ++++++++++++++++++++++++++++++ src/mlpack/core/data/load_csv.hpp | 94 ++------------------------- 2 files changed, 107 insertions(+), 89 deletions(-) create mode 100644 src/mlpack/core/data/load_csv.cpp diff --git a/src/mlpack/core/data/load_csv.cpp b/src/mlpack/core/data/load_csv.cpp new file mode 100644 index 00000000000..f4f12a4afb4 --- /dev/null +++ b/src/mlpack/core/data/load_csv.cpp @@ -0,0 +1,102 @@ +#include "load_csv.hpp" + +namespace mlpack { +namespace data { + +LoadCSV::LoadCSV(std::string file, bool fatal) : + extension(Extension(file)), + fatalIfOpenFail(fatal), + fileName(std::move(file)), + inFile(fileName) +{ + CanOpen(); +} + +bool LoadCSV::CanOpen() +{ + if(!inFile.is_open()) + { + if(fatalIfOpenFail) + { + Log::Fatal << "Cannot open file '" << fileName << "'. " << std::endl; + } + else + { + Log::Warn << "Cannot open file '" << fileName << "'; load failed." + << std::endl; + } + return false; + } + inFile.unsetf(std::ios::skipws); + + return true; +} + +size_t LoadCSV::ColSize() +{ + //boost tokenizer or strtok can do the same thing, I use + //spirit at here because I think this is a nice example + using namespace boost::spirit; + using bsi_type = boost::spirit::istream_iterator; + using iter_type = boost::iterator_range; + + inFile.clear(); + inFile.seekg(0, std::ios::beg); + //spirit::qi requires iterators to be atleast forward iterators, + //but std::istream_iterator is input iteraotr, so we use + //boost::spirit::istream_iterator to overcome this problem + bsi_type begin(inFile); + bsi_type end; + size_t col = 0; + + //the parser of boost spirit can work with "actions"(functor) + //when the parser find match target, this functor will be executed + auto findColSize = [&col](iter_type){ ++col; }; + + //qi::char_ bite an character + //qi::char_(",\r\n") only bite a "," or "\r" or "\n" character + //* means the parser(ex : qi::char_) can bite [0, any size] of characters + //~ means negate, so ~qi::char_(",\r\n") means I want to bite anything except of ",\r\n" + //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) + + //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser + //will try to convert the string to std::string, this may cause memory allocation(if small string + //optimization fail). + //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will + //become boost::iterator_range, this could save a tons of memory allocations + qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); + + return col; +} + +size_t LoadCSV::RowSize() +{ + inFile.clear(); + inFile.seekg(0, std::ios::beg); + size_t row = 0; + std::string line; + while(std::getline(inFile, line)) + { + ++row; + } + + return row; +} + +boost::spirit::qi::rule +LoadCSV::CreateCharRule() const +{ + using namespace boost::spirit; + + if(extension == "csv" || extension == "txt") + { + return qi::raw[*~qi::char_(",\r\n")]; + } + else + { + return qi::raw[*~qi::char_("\t\r\n")]; + } +} + +} // namespace data +} // namespace mlpack diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index c80fee010f0..0a16961cb33 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -29,14 +29,7 @@ namespace data /** Functions to load and save matrices and models. */ { class LoadCSV { public: - explicit LoadCSV(std::string file, bool fatal = false) : - extension(Extension(file)), - fatalIfOpenFail(fatal), - fileName(std::move(file)), - inFile(fileName) - { - CanOpen(); - } + explicit LoadCSV(std::string file, bool fatal = false); template void Load(arma::Mat &inout, DatasetInfo &infoSet, bool transpose = true) @@ -58,56 +51,9 @@ class LoadCSV } } - size_t ColSize() - { - //boost tokenizer or strtok can do the same thing, I use - //spirit at here because I think this is a nice example - using namespace boost::spirit; - using bsi_type = boost::spirit::istream_iterator; - using iter_type = boost::iterator_range; - - inFile.clear(); - inFile.seekg(0, std::ios::beg); - //spirit::qi requires iterators to be atleast forward iterators, - //but std::istream_iterator is input iteraotr, so we use - //boost::spirit::istream_iterator to overcome this problem - bsi_type begin(inFile); - bsi_type end; - size_t col = 0; - - //the parser of boost spirit can work with "actions"(functor) - //when the parser find match target, this functor will be executed - auto findColSize = [&col](iter_type){ ++col; }; - - //qi::char_ bite an character - //qi::char_(",\r\n") only bite a "," or "\r" or "\n" character - //* means the parser(ex : qi::char_) can bite [0, any size] of characters - //~ means negate, so ~qi::char_(",\r\n") means I want to bite anything except of ",\r\n" - //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) - - //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser - //will try to convert the string to std::string, this may cause memory allocation(if small string - //optimization fail). - //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will - //become boost::iterator_range, this could save a tons of memory allocations - qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); - - return col; - } - - size_t RowSize() - { - inFile.clear(); - inFile.seekg(0, std::ios::beg); - size_t row = 0; - std::string line; - while(std::getline(inFile, line)) - { - ++row; - } + size_t ColSize(); - return row; - } + size_t RowSize(); private: using iter_type = boost::iterator_range; @@ -133,25 +79,7 @@ class LoadCSV } }; - bool CanOpen() - { - if(!inFile.is_open()) - { - if(fatalIfOpenFail) - { - Log::Fatal << "Cannot open file '" << fileName << "'. " << std::endl; - } - else - { - Log::Warn << "Cannot open file '" << fileName << "'; load failed." - << std::endl; - } - return false; - } - inFile.unsetf(std::ios::skipws); - - return true; - } + bool CanOpen(); template void NonTranposeParse(arma::Mat &inout, DatasetInfo &infoSet) @@ -332,19 +260,7 @@ class LoadCSV } boost::spirit::qi::rule - CreateCharRule() const - { - using namespace boost::spirit; - - if(extension == "csv" || extension == "txt") - { - return qi::raw[*~qi::char_(",\r\n")]; - } - else - { - return qi::raw[*~qi::char_("\t\r\n")]; - } - } + CreateCharRule() const; std::string extension; bool fatalIfOpenFail; From 1bdaf1a60c57368ee8c336e0c6862f166f379be3 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 14:31:44 +0800 Subject: [PATCH 23/40] remove forward declaration --- src/mlpack/core/data/load.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index c8decf326aa..19e238a9403 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -19,8 +19,6 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { -class DatasetInfo; - /** * Loads a matrix from file, guessing the filetype from the extension. This * will transpose the matrix at load time (unless the transpose parameter is set From 4435829eeb3a87bcc732fa9b1db81c887cbc3cea Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 14:32:53 +0800 Subject: [PATCH 24/40] include mlpack/core.hpp before arma_extend.hpp to prevent some weird bugs --- src/mlpack/core/data/load_csv.hpp | 5 +++-- src/mlpack/core/data/load_impl.hpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 0a16961cb33..bcb3af1c3d2 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -7,11 +7,12 @@ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP -#include - +#include #include #include // Includes Armadillo. +#include + #include #include diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index eaddb95b6df..4ba5e1fc608 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -9,8 +9,8 @@ // In case it hasn't already been included. #include "load.hpp" -#include "extension.hpp" #include "load_csv.hpp" +#include "extension.hpp" #include #include From 2264bb4219d431dad990c73582328feedffdff8b Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 15:36:11 +0800 Subject: [PATCH 25/40] add header load_csv.hpp --- src/mlpack/core/data/load.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 19e238a9403..d846d32f43d 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -15,6 +15,7 @@ #include "format.hpp" #include "dataset_info.hpp" +#include "load_csv.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { From baa1a64a951d5e795ee3560b4c47335c1e341580 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 22:24:58 +0800 Subject: [PATCH 26/40] change order of sources --- src/mlpack/core/data/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 524057d902f..5b415e90bd8 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -5,9 +5,9 @@ set(SOURCES dataset_info_impl.hpp extension.hpp format.hpp - load.hpp load_csv.hpp load_csv.cpp + load.hpp load_impl.hpp load_arff.hpp load_arff_impl.hpp From 1a70984cde97fb30d1b48a04683312e80ce7c99d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 22:43:08 +0800 Subject: [PATCH 27/40] remove useless include file --- src/mlpack/core/data/load_csv.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index bcb3af1c3d2..30ece1c52d8 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -9,7 +9,6 @@ #include #include -#include // Includes Armadillo. #include From f19c11a38500a3a052d794c8d970d231e2086217 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 8 Jun 2016 22:47:46 +0800 Subject: [PATCH 28/40] change order of header file --- src/mlpack/core/data/load_csv.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 30ece1c52d8..f0a0b38d52a 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -7,12 +7,12 @@ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP +#include + #include #include -#include - -#include +#include #include #include "format.hpp" From 3beb8909d027921af9dd779f931fdbddbce32c18 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Wed, 29 Jun 2016 05:50:20 +0800 Subject: [PATCH 29/40] move implementation details from cpp back to hpp --- src/mlpack/core/data/CMakeLists.txt | 1 - src/mlpack/core/data/load_csv.hpp | 95 +++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 5b415e90bd8..0a1395c6cf9 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -6,7 +6,6 @@ set(SOURCES extension.hpp format.hpp load_csv.hpp - load_csv.cpp load.hpp load_impl.hpp load_arff.hpp diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index f0a0b38d52a..58f3043b742 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -15,6 +15,7 @@ #include #include +#include "extension.hpp" #include "format.hpp" #include "dataset_info.hpp" @@ -29,7 +30,14 @@ namespace data /** Functions to load and save matrices and models. */ { class LoadCSV { public: - explicit LoadCSV(std::string file, bool fatal = false); + explicit LoadCSV(std::string file, bool fatal = false) : + extension(Extension(file)), + fatalIfOpenFail(fatal), + fileName(std::move(file)), + inFile(fileName) + { + CanOpen(); + } template void Load(arma::Mat &inout, DatasetInfo &infoSet, bool transpose = true) @@ -51,9 +59,56 @@ class LoadCSV } } - size_t ColSize(); + size_t ColSize() + { + //boost tokenizer or strtok can do the same thing, I use + //spirit at here because I think this is a nice example + using namespace boost::spirit; + using bsi_type = boost::spirit::istream_iterator; + using iter_type = boost::iterator_range; + + inFile.clear(); + inFile.seekg(0, std::ios::beg); + //spirit::qi requires iterators to be atleast forward iterators, + //but std::istream_iterator is input iteraotr, so we use + //boost::spirit::istream_iterator to overcome this problem + bsi_type begin(inFile); + bsi_type end; + size_t col = 0; + + //the parser of boost spirit can work with "actions"(functor) + //when the parser find match target, this functor will be executed + auto findColSize = [&col](iter_type){ ++col; }; + + //qi::char_ bite an character + //qi::char_(",\r\n") only bite a "," or "\r" or "\n" character + //* means the parser(ex : qi::char_) can bite [0, any size] of characters + //~ means negate, so ~qi::char_(",\r\n") means I want to bite anything except of ",\r\n" + //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) + + //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser + //will try to convert the string to std::string, this may cause memory allocation(if small string + //optimization fail). + //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will + //become boost::iterator_range, this could save a tons of memory allocations + qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); + + return col; + } + + size_t RowSize() + { + inFile.clear(); + inFile.seekg(0, std::ios::beg); + size_t row = 0; + std::string line; + while(std::getline(inFile, line)) + { + ++row; + } - size_t RowSize(); + return row; + } private: using iter_type = boost::iterator_range; @@ -79,7 +134,25 @@ class LoadCSV } }; - bool CanOpen(); + bool CanOpen() + { + if(!inFile.is_open()) + { + if(fatalIfOpenFail) + { + Log::Fatal << "Cannot open file '" << fileName << "'. " << std::endl; + } + else + { + Log::Warn << "Cannot open file '" << fileName << "'; load failed." + << std::endl; + } + return false; + } + inFile.unsetf(std::ios::skipws); + + return true; + } template void NonTranposeParse(arma::Mat &inout, DatasetInfo &infoSet) @@ -260,7 +333,19 @@ class LoadCSV } boost::spirit::qi::rule - CreateCharRule() const; + CreateCharRule() const + { + using namespace boost::spirit; + + if(extension == "csv" || extension == "txt") + { + return qi::raw[*~qi::char_(",\r\n")]; + } + else + { + return qi::raw[*~qi::char_("\t\r\n")]; + } + } std::string extension; bool fatalIfOpenFail; From 3df712b83bc45d55aff895863615ce16b4a6d7b5 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 12 Feb 2017 18:06:00 +0800 Subject: [PATCH 30/40] move part of the implementation details to cpp --- src/mlpack/core/data/load_csv.hpp | 130 +++++------------------------- 1 file changed, 21 insertions(+), 109 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 58f3043b742..21f9b287ea7 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -17,10 +17,10 @@ #include "extension.hpp" #include "format.hpp" -#include "dataset_info.hpp" +#include "dataset_mapper.hpp" namespace mlpack { -namespace data /** Functions to load and save matrices and models. */ { +namespace data { /** *Load the csv file.This class use boost::spirit @@ -30,25 +30,16 @@ namespace data /** Functions to load and save matrices and models. */ { class LoadCSV { public: - explicit LoadCSV(std::string file, bool fatal = false) : - extension(Extension(file)), - fatalIfOpenFail(fatal), - fileName(std::move(file)), - inFile(fileName) - { - CanOpen(); - } + explicit LoadCSV(std::string file, bool fatal = false); - template - void Load(arma::Mat &inout, DatasetInfo &infoSet, bool transpose = true) + template + void Load(arma::Mat &inout, DatasetMapper &infoSet, bool transpose = true) { if(!CanOpen()) { return; } - //please refer to the comments of ColSize if you do not familiar - //with boost::spirit yet if(transpose) { TranposeParse(inout, infoSet); @@ -59,56 +50,8 @@ class LoadCSV } } - size_t ColSize() - { - //boost tokenizer or strtok can do the same thing, I use - //spirit at here because I think this is a nice example - using namespace boost::spirit; - using bsi_type = boost::spirit::istream_iterator; - using iter_type = boost::iterator_range; - - inFile.clear(); - inFile.seekg(0, std::ios::beg); - //spirit::qi requires iterators to be atleast forward iterators, - //but std::istream_iterator is input iteraotr, so we use - //boost::spirit::istream_iterator to overcome this problem - bsi_type begin(inFile); - bsi_type end; - size_t col = 0; - - //the parser of boost spirit can work with "actions"(functor) - //when the parser find match target, this functor will be executed - auto findColSize = [&col](iter_type){ ++col; }; - - //qi::char_ bite an character - //qi::char_(",\r\n") only bite a "," or "\r" or "\n" character - //* means the parser(ex : qi::char_) can bite [0, any size] of characters - //~ means negate, so ~qi::char_(",\r\n") means I want to bite anything except of ",\r\n" - //parse % "," means you want to parse string like "1,2,3,apple"(noticed it without last comma) - - //qi::raw restrict the automatic conversion of boost::spirit, without it, spirit parser - //will try to convert the string to std::string, this may cause memory allocation(if small string - //optimization fail). - //After we wrap the parser with qi::raw, the attribute(the data accepted by functor) will - //become boost::iterator_range, this could save a tons of memory allocations - qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ","); - - return col; - } - - size_t RowSize() - { - inFile.clear(); - inFile.seekg(0, std::ios::beg); - size_t row = 0; - std::string line; - while(std::getline(inFile, line)) - { - ++row; - } - - return row; - } + size_t ColSize(); + size_t RowSize(); private: using iter_type = boost::iterator_range; @@ -134,34 +77,16 @@ class LoadCSV } }; - bool CanOpen() - { - if(!inFile.is_open()) - { - if(fatalIfOpenFail) - { - Log::Fatal << "Cannot open file '" << fileName << "'. " << std::endl; - } - else - { - Log::Warn << "Cannot open file '" << fileName << "'; load failed." - << std::endl; - } - return false; - } - inFile.unsetf(std::ios::skipws); - - return true; - } + bool CanOpen(); - template - void NonTranposeParse(arma::Mat &inout, DatasetInfo &infoSet) + template + void NonTranposeParse(arma::Mat &inout, DatasetMapper &infoSet) { using namespace boost::spirit; size_t row = 0; size_t col = 0; - infoSet = DatasetInfo(RowSize()); + infoSet = DatasetMapper(RowSize()); std::string line; inout.set_size(infoSet.Dimensionality(), ColSize()); inFile.clear(); @@ -208,18 +133,17 @@ class LoadCSV } } - template - void TranposeParse(arma::Mat &inout, DatasetInfo &infoSet) + template + void TranposeParse(arma::Mat &inout, DatasetMapper &infoSet) { - infoSet = DatasetInfo(ColSize()); + infoSet = DatasetMapper(ColSize()); inout.set_size(infoSet.Dimensionality(), RowSize()); size_t parseTime = 0; std::set mapCols; while(!TranposeParseImpl(inout, infoSet, mapCols)) - { - //avoid infinite loop - ++parseTime; - infoSet = DatasetInfo(inout.n_rows); + { + ++parseTime; //avoid infinite loop + infoSet = DatasetMapper(inout.n_rows); if(parseTime == inout.n_rows) { return; @@ -227,8 +151,8 @@ class LoadCSV } } - template - bool TranposeParseImpl(arma::Mat &inout, DatasetInfo &infoSet, + template + bool TranposeParseImpl(arma::Mat &inout, DatasetMapper &infoSet, std::set &mapCols) { using namespace boost::spirit; @@ -310,7 +234,7 @@ class LoadCSV auto elemParser = ElemParser::Parser(); //qi::skip can specify which characters you want to skip, //in this example, elemParser will parse int or double value, - //but we do not want space to intefere it, so we skip it by qi::skip + //we use qi::skip to skip space //qi::omit can omit the attributes of spirit, every parser of spirit //has attribute(the type will pass into actions(functor)) @@ -333,19 +257,7 @@ class LoadCSV } boost::spirit::qi::rule - CreateCharRule() const - { - using namespace boost::spirit; - - if(extension == "csv" || extension == "txt") - { - return qi::raw[*~qi::char_(",\r\n")]; - } - else - { - return qi::raw[*~qi::char_("\t\r\n")]; - } - } + CreateCharRule() const; std::string extension; bool fatalIfOpenFail; From 1bf513a50fba32b864e55475aef9b52bd18700da Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 12 Feb 2017 18:06:48 +0800 Subject: [PATCH 31/40] 1 : use extern template to export part of the implementation of Load function 2 : use LoadCSV to relpace implementation of the part of csv loading --- src/mlpack/core/data/CMakeLists.txt | 2 + src/mlpack/core/data/load.cpp | 18 ++++++ src/mlpack/core/data/load.hpp | 13 ++++ src/mlpack/core/data/load_impl.hpp | 99 ++--------------------------- 4 files changed, 40 insertions(+), 92 deletions(-) create mode 100644 src/mlpack/core/data/load.cpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index be64aace056..bc272272f39 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -6,8 +6,10 @@ set(SOURCES extension.hpp format.hpp load_csv.hpp + load_csv.cpp load.hpp load_impl.hpp + load.cpp load_arff.hpp load_arff_impl.hpp normalize_labels.hpp diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp new file mode 100644 index 00000000000..3170f4ce81e --- /dev/null +++ b/src/mlpack/core/data/load.cpp @@ -0,0 +1,18 @@ +#include "load.hpp" + +namespace mlpack { +namespace data /** Functions to load and save matrices and models. */ { + +/*template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose);//*/ + +}} diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 7302957e54a..3a5877b009b 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -18,6 +18,7 @@ #include // Includes Armadillo. #include +#include "load.hpp" #include "format.hpp" #include "dataset_mapper.hpp" @@ -103,6 +104,18 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); +/*extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); + +extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose);//*/ + /** * Load a model from a file, guessing the filetype from the extension, or, * optionally, loading the specified format. If automatic extension detection diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 9bde286cadf..985db4c33b6 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -13,12 +13,14 @@ #define MLPACK_CORE_DATA_LOAD_IMPL_HPP // In case it hasn't already been included. -#include "load.hpp" -#include "extension.hpp" #include #include +#include "load_csv.hpp" +#include "load.hpp" +#include "extension.hpp" + #include #include #include @@ -64,7 +66,7 @@ void TransposeTokens(std::vector> const &input, } } -} +} //namespace details template bool inline inplace_transpose(arma::Mat& X) @@ -369,95 +371,8 @@ bool Load(const std::string& filename, if (extension == "csv" || extension == "tsv" || extension == "txt") { - // True if we're looking for commas; if false, we're looking for spaces. - bool commas = (extension == "csv"); - - std::string type; - if (extension == "csv") - type = "CSV data"; - else - type = "raw ASCII-formatted data"; - - Log::Info << "Loading '" << filename << "' as " << type << ". " - << std::flush; - std::string separators; - if (commas) - separators = ","; - else - separators = " \t"; - - // We'll load this as CSV (or CSV with spaces or tabs) according to - // RFC4180. So the first thing to do is determine the size of the matrix. - std::string buffer; - size_t cols = 0; - - std::getline(stream, buffer, '\n'); - // Count commas and whitespace in the line, ignoring anything inside - // quotes. - typedef boost::tokenizer> Tokenizer; - boost::escaped_list_separator sep("\\", separators, "\""); - Tokenizer tok(buffer, sep); - for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) - ++cols; - - // Now count the number of lines in the file. We've already counted the - // first one. - size_t rows = 1; - while (!stream.eof() && !stream.bad() && !stream.fail()) - { - std::getline(stream, buffer, '\n'); - if (!stream.fail()) - ++rows; - } - - // Now we have the size. So resize our matrix. - if (transpose) - { - matrix.set_size(cols, rows); - info = DatasetMapper(info.Policy(), cols); - } - else - { - matrix.set_size(rows, cols); - info = DatasetMapper(info.Policy(), rows); - } - - stream.close(); - stream.open(filename, std::fstream::in); - - if (transpose) - { - std::vector> tokensArray; - std::vector tokens; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - tokens = details::ToTokens(lineTok); - if (tokens.size() == cols) - { - tokensArray.emplace_back(std::move(tokens)); - } - } - for(size_t i = 0; i != cols; ++i) - { - details::TransposeTokens(tokensArray, tokens, i); - info.MapTokens(tokens, i, matrix); - } - } - else - { - size_t row = 0; - while (!stream.bad() && !stream.fail() && !stream.eof()) - { - // Extract line by line. - std::getline(stream, buffer, '\n'); - Tokenizer lineTok(buffer, sep); - info.MapTokens(details::ToTokens(lineTok), row, matrix); - ++row; - } - } + LoadCSV loader(filename, fatal); + loader.Load(matrix, info, transpose); } else if (extension == "arff") { From c42372e21c0e98a7bca6e326e7a9c987ccecafdf Mon Sep 17 00:00:00 2001 From: Ryan Curtin Date: Tue, 14 Feb 2017 13:43:35 -0500 Subject: [PATCH 32/40] Fix Armadillo warning. --- src/mlpack/core/data/load.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 3a5877b009b..534c687fecb 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -14,11 +14,10 @@ #ifndef MLPACK_CORE_DATA_LOAD_HPP #define MLPACK_CORE_DATA_LOAD_HPP +#include #include -#include // Includes Armadillo. #include -#include "load.hpp" #include "format.hpp" #include "dataset_mapper.hpp" From cc5d541ebb7a2a01f136c11165f26680a9242485 Mon Sep 17 00:00:00 2001 From: Ryan Curtin Date: Tue, 14 Feb 2017 14:02:44 -0500 Subject: [PATCH 33/40] Use extern templates to compile Load() overloads, so that spirit doesn't get included. --- src/mlpack/core/data/CMakeLists.txt | 3 +- src/mlpack/core/data/load.cpp | 14 ++- src/mlpack/core/data/load.hpp | 17 +++- src/mlpack/core/data/load_impl.hpp | 91 ------------------- .../preprocess/preprocess_imputer_main.cpp | 1 + src/mlpack/tests/imputation_test.cpp | 1 + src/mlpack/tests/load_save_test.cpp | 1 + 7 files changed, 30 insertions(+), 98 deletions(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index bc272272f39..87a5d9cc228 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -7,7 +7,8 @@ set(SOURCES format.hpp load_csv.hpp load_csv.cpp - load.hpp + load.hpp + load_model_impl.hpp load_impl.hpp load.cpp load_arff.hpp diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp index 3170f4ce81e..f3167669921 100644 --- a/src/mlpack/core/data/load.cpp +++ b/src/mlpack/core/data/load.cpp @@ -1,9 +1,16 @@ #include "load.hpp" +#include "load_impl.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { -/*template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +template bool Load(const std::string&, arma::Mat&, const bool, const bool); +template bool Load(const std::string&, arma::Mat&, const bool, const bool); +template bool Load(const std::string&, arma::Mat&, const bool, const bool); +template bool Load(const std::string&, arma::Mat&, const bool, const bool); +template bool Load(const std::string&, arma::Mat&, const bool, const bool); + +template bool Load(std::string const&, arma::Mat&, DatasetMapper&, const bool fatal, const bool transpose); template bool Load(std::string const&, arma::Mat&, DatasetMapper&, @@ -13,6 +20,9 @@ template bool Load(std::string const&, arma::Mat& const bool fatal, const bool transpose); template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose);//*/ +const bool fatal, const bool transpose); + +template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); }} diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 534c687fecb..df72d8314a5 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -64,6 +64,12 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); +extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); +extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); +extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); +extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); +extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); + /** * Loads a matrix from a file, guessing the filetype from the extension and * mapping categorical features with a DatasetMapper object. This will @@ -103,7 +109,7 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -/*extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, const bool fatal, const bool transpose); extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, @@ -113,7 +119,10 @@ extern template bool Load(std::string const&, arma::Mat< const bool fatal, const bool transpose); extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose);//*/ +const bool fatal, const bool transpose); + +extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, +const bool fatal, const bool transpose); /** * Load a model from a file, guessing the filetype from the extension, or, @@ -150,7 +159,7 @@ bool Load(const std::string& filename, } // namespace data } // namespace mlpack -// Include implementation. -#include "load_impl.hpp" +// Include implementation of model-loading Load() overload. +#include "load_model_impl.hpp" #endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 985db4c33b6..e44fa4fe39e 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -21,16 +21,10 @@ #include "load.hpp" #include "extension.hpp" -#include #include -#include -#include -#include #include #include -#include "serialization_shim.hpp" - #include "load_arff.hpp" namespace mlpack { @@ -416,91 +410,6 @@ bool Load(const std::string& filename, return true; } -// Load a model from file. -template -bool Load(const std::string& filename, - const std::string& name, - T& t, - const bool fatal, - format f) -{ - if (f == format::autodetect) - { - std::string extension = Extension(filename); - - if (extension == "xml") - f = format::xml; - else if (extension == "bin") - f = format::binary; - else if (extension == "txt") - f = format::text; - else - { - if (fatal) - Log::Fatal << "Unable to detect type of '" << filename << "'; incorrect" - << " extension?" << std::endl; - else - Log::Warn << "Unable to detect type of '" << filename << "'; load " - << "failed. Incorrect extension?" << std::endl; - - return false; - } - } - - // Now load the given format. - std::ifstream ifs; -#ifdef _WIN32 // Open non-text in binary mode on Windows. - if (f == format::binary) - ifs.open(filename, std::ifstream::in | std::ifstream::binary); - else - ifs.open(filename, std::ifstream::in); -#else - ifs.open(filename, std::ifstream::in); -#endif - - if (!ifs.is_open()) - { - if (fatal) - Log::Fatal << "Unable to open file '" << filename << "' to load object '" - << name << "'." << std::endl; - else - Log::Warn << "Unable to open file '" << filename << "' to load object '" - << name << "'." << std::endl; - - return false; - } - - try - { - if (f == format::xml) - { - boost::archive::xml_iarchive ar(ifs); - ar >> CreateNVP(t, name); - } - else if (f == format::text) - { - boost::archive::text_iarchive ar(ifs); - ar >> CreateNVP(t, name); - } - else if (f == format::binary) - { - boost::archive::binary_iarchive ar(ifs); - ar >> CreateNVP(t, name); - } - - return true; - } - catch (boost::archive::archive_exception& e) - { - if (fatal) - Log::Fatal << e.what() << std::endl; - else - Log::Warn << e.what() << std::endl; - - return false; - } -} - } // namespace data } // namespace mlpack diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp index 920dad811e0..267f3df7d74 100644 --- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp +++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp @@ -11,6 +11,7 @@ * http://www.opensource.org/licenses/BSD-3-Clause for more information. */ #include +#include #include #include #include diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp index 2e9f8cf34cc..9fcd26a2d72 100644 --- a/src/mlpack/tests/imputation_test.cpp +++ b/src/mlpack/tests/imputation_test.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index ac1ca62df40..b4d7eb953d1 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include "test_tools.hpp" From 436bc2c55fd4ea5e31761feda797425c73a26c7f Mon Sep 17 00:00:00 2001 From: Ryan Curtin Date: Tue, 14 Feb 2017 14:12:49 -0500 Subject: [PATCH 34/40] Add new file. --- src/mlpack/core/data/load_model_impl.hpp | 124 +++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 src/mlpack/core/data/load_model_impl.hpp diff --git a/src/mlpack/core/data/load_model_impl.hpp b/src/mlpack/core/data/load_model_impl.hpp new file mode 100644 index 00000000000..556082c7d0a --- /dev/null +++ b/src/mlpack/core/data/load_model_impl.hpp @@ -0,0 +1,124 @@ +/** + * @file load_model_impl.hpp + * @author Ryan Curtin + * + * Implementation of model-specific Load() function. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef MLPACK_CORE_DATA_LOAD_MODEL_IMPL_HPP +#define MLPACK_CORE_DATA_LOAD_MODEL_IMPL_HPP + +// In case it hasn't already been included. +#include "load.hpp" + +#include +#include + +#include "extension.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "serialization_shim.hpp" + +namespace mlpack { +namespace data { + +// Load a model from file. +template +bool Load(const std::string& filename, + const std::string& name, + T& t, + const bool fatal, + format f) +{ + if (f == format::autodetect) + { + std::string extension = Extension(filename); + + if (extension == "xml") + f = format::xml; + else if (extension == "bin") + f = format::binary; + else if (extension == "txt") + f = format::text; + else + { + if (fatal) + Log::Fatal << "Unable to detect type of '" << filename << "'; incorrect" + << " extension?" << std::endl; + else + Log::Warn << "Unable to detect type of '" << filename << "'; load " + << "failed. Incorrect extension?" << std::endl; + + return false; + } + } + + // Now load the given format. + std::ifstream ifs; +#ifdef _WIN32 // Open non-text in binary mode on Windows. + if (f == format::binary) + ifs.open(filename, std::ifstream::in | std::ifstream::binary); + else + ifs.open(filename, std::ifstream::in); +#else + ifs.open(filename, std::ifstream::in); +#endif + + if (!ifs.is_open()) + { + if (fatal) + Log::Fatal << "Unable to open file '" << filename << "' to load object '" + << name << "'." << std::endl; + else + Log::Warn << "Unable to open file '" << filename << "' to load object '" + << name << "'." << std::endl; + + return false; + } + + try + { + if (f == format::xml) + { + boost::archive::xml_iarchive ar(ifs); + ar >> CreateNVP(t, name); + } + else if (f == format::text) + { + boost::archive::text_iarchive ar(ifs); + ar >> CreateNVP(t, name); + } + else if (f == format::binary) + { + boost::archive::binary_iarchive ar(ifs); + ar >> CreateNVP(t, name); + } + + return true; + } + catch (boost::archive::archive_exception& e) + { + if (fatal) + Log::Fatal << e.what() << std::endl; + else + Log::Warn << e.what() << std::endl; + + return false; + } +} + +} // namespace data +} // namespace mlpack + +#endif From c6c25beb061134a65c7d64a069892cb0c390573a Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 19 Feb 2017 11:49:15 +0800 Subject: [PATCH 35/40] use std::string to replace raw buffer, cpp11 guarantee memory layout is contiguous --- src/mlpack/core/data/load_impl.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index e44fa4fe39e..2dd646d5f03 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -170,15 +170,16 @@ bool Load(const std::string& filename, // This is taken from load_auto_detect() in diskio_meat.hpp const std::string ARMA_MAT_TXT = "ARMA_MAT_TXT"; - char* rawHeader = new char[ARMA_MAT_TXT.length() + 1]; + //char* rawHeader = new char[ARMA_MAT_TXT.length() + 1]; + std::string rawHeader(ARMA_MAT_TXT.length()); std::streampos pos = stream.tellg(); - stream.read(rawHeader, std::streamsize(ARMA_MAT_TXT.length())); - rawHeader[ARMA_MAT_TXT.length()] = '\0'; + stream.read(&rawHeader[0], std::streamsize(ARMA_MAT_TXT.length())); + //rawHeader[ARMA_MAT_TXT.length()] = '\0'; stream.clear(); stream.seekg(pos); // Reset stream position after peeking. - if (std::string(rawHeader) == ARMA_MAT_TXT) + if (rawHeader == ARMA_MAT_TXT) { loadType = arma::arma_ascii; stringType = "Armadillo ASCII formatted data"; @@ -193,9 +194,7 @@ bool Load(const std::string& filename, stringType = "CSV data"; else // Unknown .txt... we will throw an error. unknownType = true; - } - - delete[] rawHeader; + } } else if (extension == "bin") { From 09983f6f893a03380a877140680215917995c36d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 19 Feb 2017 12:30:05 +0800 Subject: [PATCH 36/40] 1 : fix bug, wrong constructor 2 : use std::string to replace raw buffer --- src/mlpack/core/data/load_impl.hpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 2dd646d5f03..a8e73272bd1 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -171,11 +171,10 @@ bool Load(const std::string& filename, // This is taken from load_auto_detect() in diskio_meat.hpp const std::string ARMA_MAT_TXT = "ARMA_MAT_TXT"; //char* rawHeader = new char[ARMA_MAT_TXT.length() + 1]; - std::string rawHeader(ARMA_MAT_TXT.length()); + std::string rawHeader(ARMA_MAT_TXT.length(), '\0'); std::streampos pos = stream.tellg(); stream.read(&rawHeader[0], std::streamsize(ARMA_MAT_TXT.length())); - //rawHeader[ARMA_MAT_TXT.length()] = '\0'; stream.clear(); stream.seekg(pos); // Reset stream position after peeking. @@ -200,17 +199,16 @@ bool Load(const std::string& filename, { // This could be raw binary or Armadillo binary (binary with header). We // will check to see if it is Armadillo binary. - const std::string ARMA_MAT_BIN = "ARMA_MAT_BIN"; - char *rawHeader = new char[ARMA_MAT_BIN.length() + 1]; + const std::string ARMA_MAT_BIN = "ARMA_MAT_BIN"; + std::string rawHeader(ARMA_MAT_BIN.length(), '\0'); std::streampos pos = stream.tellg(); - stream.read(rawHeader, std::streamsize(ARMA_MAT_BIN.length())); - rawHeader[ARMA_MAT_BIN.length()] = '\0'; + stream.read(&rawHeader[0], std::streamsize(ARMA_MAT_BIN.length())); stream.clear(); stream.seekg(pos); // Reset stream position after peeking. - if (std::string(rawHeader) == ARMA_MAT_BIN) + if (rawHeader == ARMA_MAT_BIN) { stringType = "Armadillo binary formatted data"; loadType = arma::arma_binary; @@ -219,9 +217,7 @@ bool Load(const std::string& filename, { stringType = "raw binary formatted data"; loadType = arma::raw_binary; - } - - delete[] rawHeader; + } } else if (extension == "pgm") { From 6cd94b63e82440ad3366b77a0d38295c093b81f8 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 19 Feb 2017 12:30:54 +0800 Subject: [PATCH 37/40] fix format --- src/mlpack/core/data/load.cpp | 25 +++++++++++++++---------- src/mlpack/core/data/load.hpp | 25 +++++++++++++++---------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp index f3167669921..ab843b4736a 100644 --- a/src/mlpack/core/data/load.cpp +++ b/src/mlpack/core/data/load.cpp @@ -10,19 +10,24 @@ template bool Load(const std::string&, arma::Mat&, const bool, con template bool Load(const std::string&, arma::Mat&, const bool, const bool); template bool Load(const std::string&, arma::Mat&, const bool, const bool); -template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); }} diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index df72d8314a5..92c8db875e8 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -109,20 +109,25 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +extern template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +extern template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +extern template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +extern template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); -extern template bool Load(std::string const&, arma::Mat&, DatasetMapper&, -const bool fatal, const bool transpose); +extern template bool Load(const std::string&, arma::Mat&, + DatasetMapper&, + const bool, const bool); /** * Load a model from a file, guessing the filetype from the extension, or, From 0d97eb9300c7efb34fb7ce0d87400ec9fcec7332 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 26 Feb 2017 07:18:27 +0800 Subject: [PATCH 38/40] remove useless file --- src/mlpack/core/data/dataset_info.hpp | 124 -------------------------- 1 file changed, 124 deletions(-) delete mode 100644 src/mlpack/core/data/dataset_info.hpp diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp deleted file mode 100644 index 29663f1260a..00000000000 --- a/src/mlpack/core/data/dataset_info.hpp +++ /dev/null @@ -1,124 +0,0 @@ -/** - * @file dataset_info.hpp - * @author Ryan Curtin - * - * Defines the DatasetInfo class, which holds information about a dataset. This - * is useful when the dataset contains categorical non-numeric features that - * needs to be mapped to categorical numeric features. - */ -#ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP -#define MLPACK_CORE_DATA_DATASET_INFO_HPP - -#include -#include -#include - -namespace mlpack { -namespace data { - -/** - * The Datatype enum specifies the types of data mlpack algorithms can use. The - * vast majority of mlpack algorithms can only use numeric data (i.e. - * float/double/etc.), but some algorithms can use categorical data, specified - * via this Datatype enum and the DatasetInfo class. - */ -enum Datatype : bool /* bool is all the precision we need for two types */ -{ - numeric = 0, - categorical = 1 -}; - -/** - * Auxiliary information for a dataset, including mappings to/from strings and - * the datatype of each dimension. DatasetInfo objects are optionally produced - * by data::Load(), and store the type of each dimension (Datatype::numeric or - * Datatype::categorical) as well as mappings from strings to unsigned integers - * and vice versa. - */ -class DatasetInfo -{ - public: - /** - * Create the DatasetInfo object with the given dimensionality. Note that the - * dimensionality cannot be changed later; you will have to create a new - * DatasetInfo object. - */ - DatasetInfo(const size_t dimensionality = 0); - - /** - * Given the string and the dimension to which it belongs, return its numeric - * mapping. If no mapping yet exists, the string is added to the list of - * mappings for the given dimension. The dimension parameter refers to the - * index of the dimension of the string (i.e. the row in the dataset). - * - * @param string String to find/create mapping for. - * @param dimension Index of the dimension of the string. - */ - size_t MapString(const std::string &string, const size_t dimension) - { - return MapStringImpl(string, dimension); - } - - size_t MapString(std::string &&string, const size_t dimension) - { - return MapStringImpl(std::move(string), dimension); - } - - /** - * Return the string that corresponds to a given value in a given dimension. - * If the string is not a valid mapping in the given dimension, a - * std::invalid_argument is thrown. - * - * @param value Mapped value for string. - * @param dimension Dimension to unmap string from. - */ - const std::string& UnmapString(const size_t value, const size_t dimension); - - //! Return the type of a given dimension (numeric or categorical). - Datatype Type(const size_t dimension) const; - //! Modify the type of a given dimension (be careful!). - Datatype& Type(const size_t dimension); - - /** - * Get the number of mappings for a particular dimension. If the dimension - * is numeric, then this will return 0. - */ - size_t NumMappings(const size_t dimension) const; - - /** - * Get the dimensionality of the DatasetInfo object (that is, how many - * dimensions it has information for). If this object was created by a call - * to mlpack::data::Load(), then the dimensionality will be the same as the - * number of rows (dimensions) in the dataset. - */ - size_t Dimensionality() const; - - /** - * Serialize the dataset information. - */ - template - void Serialize(Archive& ar, const unsigned int /* version */) - { - ar & data::CreateNVP(types, "types"); - ar & data::CreateNVP(maps, "maps"); - } - - private: - //! Types of each dimension. - std::vector types; - - //! Mappings from strings to integers. Map entries will only exist for - //! dimensions that are categorical. - std::unordered_map, - size_t>> maps; - - template - size_t MapStringImpl(T&& string, const size_t dimension); -}; - -} // namespace data -} // namespace mlpack - -#include "dataset_info_impl.hpp" - -#endif From fd3b4b16da2a755a1041a6866a1293deb7e8be64 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 26 Feb 2017 07:25:51 +0800 Subject: [PATCH 39/40] add license --- src/mlpack/core/data/load_csv.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 21f9b287ea7..f9e3fb393b5 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -3,6 +3,11 @@ * @author ThamNgapWei * * This is a csv parsers which use to parse the csv file format + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. */ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP From b92a4c1a4c19aaf51f69afa9693db5d4e17f55e8 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 12 Mar 2017 09:20:48 +0800 Subject: [PATCH 40/40] use preprocessor to omit extra instant of Load function under windows --- src/mlpack/core/data/load.cpp | 2 ++ src/mlpack/core/data/load.hpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp index ab843b4736a..9a501099eab 100644 --- a/src/mlpack/core/data/load.cpp +++ b/src/mlpack/core/data/load.cpp @@ -26,8 +26,10 @@ template bool Load(const std::string&, arma::Mat&, const bool, const bool); +#ifndef _WIN32 template bool Load(const std::string&, arma::Mat&, DatasetMapper&, const bool, const bool); +#endif }} diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 92c8db875e8..e9d9a5496f3 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -68,7 +68,10 @@ extern template bool Load(const std::string&, arma::Mat&, const bool, extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); + +#ifndef _WIN32 extern template bool Load(const std::string&, arma::Mat&, const bool, const bool); +#endif /** * Loads a matrix from a file, guessing the filetype from the extension and