From 1c48343e3ea585ab0f06222ae18ff836cb2137cc Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 16 May 2021 04:25:55 +0530 Subject: [PATCH 001/112] Adding new parser to mlpack::data:: --- src/mlpack/core/data/CMakeLists.txt | 2 + src/mlpack/core/data/new_parser.hpp | 35 +++ src/mlpack/core/data/new_parser_impl.hpp | 287 +++++++++++++++++++++++ 3 files changed, 324 insertions(+) create mode 100644 src/mlpack/core/data/new_parser.hpp create mode 100644 src/mlpack/core/data/new_parser_impl.hpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index d9d706745ca..a8b2d948a45 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -34,6 +34,8 @@ set(SOURCES confusion_matrix.hpp one_hot_encoding.hpp one_hot_encoding_impl.hpp + new_parser.hpp + new_parser_impl.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp new file mode 100644 index 00000000000..9a95ecb36ab --- /dev/null +++ b/src/mlpack/core/data/new_parser.hpp @@ -0,0 +1,35 @@ +#ifndef MLPACK_CORE_DATA_DISKIO_CURRY_HPP +#define MLPACK_CORE_DATA_DISKIO_CURRY_HPP + +namespace mlpack { +namespace data { + +template +inline +bool +convert_token(eT& val, const std::string& token); + +template +inline +bool +convert_token(std::complex& val, const std::string& token); + +template +inline +bool +load_csv_ascii(arma::Mat& x, std::istream& f, std::string&); + +template +inline +arma_cold +bool +load_data(const std::string name, const arma::file_type type); + + + } // namespace data +} // namepsace mlpack + +// Include implementation +#include "diskio_curry_impl.hpp" + +#endif diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp new file mode 100644 index 00000000000..93bfd42841a --- /dev/null +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -0,0 +1,287 @@ +#ifndef MLPACK_CORE_DATA_DISKIO_CURRY__IMPL_HPP +#define 
MLPACK_CORE_DATA_DISKIO_CURRY_IMPL_HPP + +#include "diskio_curry.hpp" + +namespace mlpack { +namespace data { + +template +inline +bool +convert_token(eT& val, const std::string& token) + { + const size_t N = size_t(token.length()); + + if(N == 0) { val = eT(0); return true; } + + const char* str = token.c_str(); + + if( (N == 3) || (N == 4) ) + { + const bool neg = (str[0] == '-'); + const bool pos = (str[0] == '+'); + + const size_t offset = ( (neg || pos) && (N == 4) ) ? 1 : 0; + + const char sig_a = str[offset ]; + const char sig_b = str[offset+1]; + const char sig_c = str[offset+2]; + + if( ((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F')) ) + { + val = neg ? arma::cond_rel< arma::is_signed::value >::make_neg(arma::Datum::inf) : arma::Datum::inf; + + return true; + } + else + if( ((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) && ((sig_c == 'n') || (sig_c == 'N')) ) + { + val = arma::Datum::nan; + + return true; + } + } + + + char* endptr = nullptr; + + if(arma::is_real::value) + { + val = eT( std::strtod(str, &endptr) ); + } + else + { + if(arma::is_signed::value) + { + // signed integer + + val = eT( std::strtoll(str, &endptr, 10) ); + } + else + { + // unsigned integer + + if(str[0] == '-') { val = eT(0); return true; } + + val = eT( std::strtoull(str, &endptr, 10) ); + } + } + + if(str == endptr) { return false; } + + return true; + } + + + +template +inline +bool +convert_token(std::complex& val, const std::string& token) + { + const size_t N = size_t(token.length()); + const size_t Nm1 = N-1; + + if(N == 0) { val = std::complex(0); return true; } + + const char* str = token.c_str(); + + // valid complex number formats: + // (real,imag) + // (real) + // () + + if( (token[0] != '(') || (token[Nm1] != ')') ) + { + // no brackets, so treat the token as a non-complex number + + T val_real; + + const bool state = convert_token(val_real, token); // use the non-complex version 
of this function + + val = std::complex(val_real); + + return state; + } + + // does the token contain only the () brackets? + if(N <= 2) { val = std::complex(0); return true; } + + size_t comma_loc = 0; + bool comma_found = false; + + for(size_t i=0; i(val_real); + } + else + { + const std::string token_real( &(str[1]), (comma_loc - 1 ) ); + const std::string token_imag( &(str[comma_loc+1]), (Nm1 - 1 - comma_loc) ); + + T val_real; + T val_imag; + + const bool state_real = convert_token(val_real, token_real); + const bool state_imag = convert_token(val_imag, token_imag); + + state = (state_real && state_imag); + + val = std::complex(val_real, val_imag); + } + + return state; + } + +//! Load a matrix in CSV text format (human readable) +template +inline +bool +load_csv_ascii(arma::Mat& x, std::istream& f, std::string&) + { + // TODO: replace with more efficient implementation + + bool load_okay = f.good(); + + f.clear(); + const std::fstream::pos_type pos1 = f.tellg(); + + // + // work out the size + + arma::uword f_n_rows = 0; + arma::uword f_n_cols = 0; + + std::string line_string; + std::stringstream line_stream; + + std::string token; + + while( f.good() && load_okay ) + { + std::getline(f, line_string); + + if(line_string.size() == 0) { break; } + + line_stream.clear(); + line_stream.str(line_string); + + arma::uword line_n_cols = 0; + + while(line_stream.good()) + { + std::getline(line_stream, token, ','); + ++line_n_cols; + } + + if(f_n_cols < line_n_cols) { f_n_cols = line_n_cols; } + + ++f_n_rows; + } + + f.clear(); + f.seekg(pos1); + + x.zeros(f_n_rows, f_n_cols); + + arma::uword row = 0; + + while(f.good()) + { + std::getline(f, line_string); + + if(line_string.size() == 0) { break; } + + line_stream.clear(); + line_stream.str(line_string); + + arma::uword col = 0; + + while(line_stream.good()) + { + std::getline(line_stream, token, ','); + + convert_token( x.at(row,col), token ); + + ++col; + } + + ++row; + } + + return load_okay; + } + +template 
+inline +arma_cold +bool +load_data(arma::Mat& x, const arma::file_type type, std::istream& f) + { + + bool load_okay = false; + std::string err_msg; + std::string g = "y"; + switch(type) + { + case arma::auto_detect: + load_okay = true; + break; + + case arma::csv_ascii: + load_okay = load_csv_ascii(x, f, g); + case arma::raw_ascii: + load_okay = true; + break; + + case arma::arma_ascii: + load_okay = true; + break; + + case arma::coord_ascii: + load_okay = true; + break; + + case arma::raw_binary: + load_okay = true; + break; + + case arma::arma_binary: + load_okay = true; + break; + + case arma::pgm_binary: + load_okay = true; + break; + + case arma::hdf5_binary: + return true; + break; + + case arma::hdf5_binary_trans: // kept for compatibility with earlier versions of Armadillo + return true; + break; + + default: + load_okay = false; + } + + return load_okay; + } +} // namespace data +} // namespace mlpack + +#endif From fe70a12eab8a3bacc41edfe14df85db1dc03a25a Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 16 May 2021 16:03:31 +0530 Subject: [PATCH 002/112] changes --- src/mlpack/core/data/new_parser.hpp | 2 +- src/mlpack/core/data/new_parser_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp index 9a95ecb36ab..fa7410408c3 100644 --- a/src/mlpack/core/data/new_parser.hpp +++ b/src/mlpack/core/data/new_parser.hpp @@ -30,6 +30,6 @@ load_data(const std::string name, const arma::file_type type); } // namepsace mlpack // Include implementation -#include "diskio_curry_impl.hpp" +#include "new_parser_impl.hpp" #endif diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp index 93bfd42841a..61e0e84c787 100644 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -1,7 +1,7 @@ #ifndef MLPACK_CORE_DATA_DISKIO_CURRY__IMPL_HPP #define MLPACK_CORE_DATA_DISKIO_CURRY_IMPL_HPP -#include 
"diskio_curry.hpp" +#include "new_parser.hpp" namespace mlpack { namespace data { From 0919664c36e3d300a45cdc9c7eafb2c8a637cc73 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 16 May 2021 17:33:11 +0530 Subject: [PATCH 003/112] changes --- src/mlpack/core/data/new_parser.hpp | 4 ++-- src/mlpack/core/data/new_parser_impl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp index fa7410408c3..e5070374793 100644 --- a/src/mlpack/core/data/new_parser.hpp +++ b/src/mlpack/core/data/new_parser.hpp @@ -1,5 +1,5 @@ -#ifndef MLPACK_CORE_DATA_DISKIO_CURRY_HPP -#define MLPACK_CORE_DATA_DISKIO_CURRY_HPP +#ifndef MLPACK_CORE_DATA_NEW_PARSER_HPP +#define MLPACK_CORE_DATA_NEW_PARSER_HPP namespace mlpack { namespace data { diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp index 61e0e84c787..9748af33f6e 100644 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -1,5 +1,5 @@ -#ifndef MLPACK_CORE_DATA_DISKIO_CURRY__IMPL_HPP -#define MLPACK_CORE_DATA_DISKIO_CURRY_IMPL_HPP +#ifndef MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP +#define MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP #include "new_parser.hpp" From aa649bdddf5e64ca23c1809eba7e23cb99da3165 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 16 May 2021 18:02:29 +0530 Subject: [PATCH 004/112] style checks --- src/mlpack/core/data/new_parser.hpp | 2 +- src/mlpack/core/data/new_parser_impl.hpp | 538 ++++++++++++----------- 2 files changed, 284 insertions(+), 256 deletions(-) diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp index e5070374793..dd71f056729 100644 --- a/src/mlpack/core/data/new_parser.hpp +++ b/src/mlpack/core/data/new_parser.hpp @@ -27,7 +27,7 @@ load_data(const std::string name, const arma::file_type type); } // namespace data -} // namepsace mlpack +} // namespace mlpack // Include implementation 
#include "new_parser_impl.hpp" diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp index 9748af33f6e..efddb8af3e2 100644 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -3,285 +3,313 @@ #include "new_parser.hpp" -namespace mlpack { -namespace data { - -template -inline -bool -convert_token(eT& val, const std::string& token) +namespace mlpack +{ + namespace data { - const size_t N = size_t(token.length()); - - if(N == 0) { val = eT(0); return true; } - - const char* str = token.c_str(); - - if( (N == 3) || (N == 4) ) + + template + inline bool + convert_token(eT &val, const std::string &token) { - const bool neg = (str[0] == '-'); - const bool pos = (str[0] == '+'); - - const size_t offset = ( (neg || pos) && (N == 4) ) ? 1 : 0; - - const char sig_a = str[offset ]; - const char sig_b = str[offset+1]; - const char sig_c = str[offset+2]; - - if( ((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F')) ) + const size_t N = size_t(token.length()); + + if (N == 0) { - val = neg ? arma::cond_rel< arma::is_signed::value >::make_neg(arma::Datum::inf) : arma::Datum::inf; - - return true; + val = eT(0); + return true; } - else - if( ((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) && ((sig_c == 'n') || (sig_c == 'N')) ) + + const char *str = token.c_str(); + + if ((N == 3) || (N == 4)) { - val = arma::Datum::nan; - - return true; + const bool neg = (str[0] == '-'); + const bool pos = (str[0] == '+'); + + const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; + + const char sig_a = str[offset]; + const char sig_b = str[offset + 1]; + const char sig_c = str[offset + 2]; + + if (((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F'))) + { + val = neg ? 
arma::cond_rel::value>::make_neg(arma::Datum::inf) : arma::Datum::inf; + + return true; + } + else if (((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) && ((sig_c == 'n') || (sig_c == 'N'))) + { + val = arma::Datum::nan; + + return true; + } } + + char *endptr = nullptr; + + if (arma::is_real::value) + { + val = eT(std::strtod(str, &endptr)); + } + else + { + if (arma::is_signed::value) + { + // signed integer + + val = eT(std::strtoll(str, &endptr, 10)); + } + else + { + // unsigned integer + + if (str[0] == '-') + { + val = eT(0); + return true; + } + + val = eT(std::strtoull(str, &endptr, 10)); + } + } + + if (str == endptr) + { + return false; + } + + return true; } - - - char* endptr = nullptr; - - if(arma::is_real::value) - { - val = eT( std::strtod(str, &endptr) ); - } - else + + template + inline bool + convert_token(std::complex &val, const std::string &token) { - if(arma::is_signed::value) + const size_t N = size_t(token.length()); + const size_t Nm1 = N - 1; + + if (N == 0) { - // signed integer - - val = eT( std::strtoll(str, &endptr, 10) ); + val = std::complex(0); + return true; } - else + + const char *str = token.c_str(); + + // valid complex number formats: + // (real,imag) + // (real) + // () + + if ((token[0] != '(') || (token[Nm1] != ')')) { - // unsigned integer - - if(str[0] == '-') { val = eT(0); return true; } - - val = eT( std::strtoull(str, &endptr, 10) ); + // no brackets, so treat the token as a non-complex number + + T val_real; + + const bool state = convert_token(val_real, token); // use the non-complex version of this function + + val = std::complex(val_real); + + return state; + } + + // does the token contain only the () brackets? 
+ if (N <= 2) + { + val = std::complex(0); + return true; } - } - - if(str == endptr) { return false; } - - return true; - } + size_t comma_loc = 0; + bool comma_found = false; + for (size_t i = 0; i < N; ++i) + { + if (str[i] == ',') + { + comma_loc = i; + comma_found = true; + break; + } + } -template -inline -bool -convert_token(std::complex& val, const std::string& token) - { - const size_t N = size_t(token.length()); - const size_t Nm1 = N-1; - - if(N == 0) { val = std::complex(0); return true; } - - const char* str = token.c_str(); - - // valid complex number formats: - // (real,imag) - // (real) - // () - - if( (token[0] != '(') || (token[Nm1] != ')') ) - { - // no brackets, so treat the token as a non-complex number - - T val_real; - - const bool state = convert_token(val_real, token); // use the non-complex version of this function - - val = std::complex(val_real); - - return state; - } - - // does the token contain only the () brackets? - if(N <= 2) { val = std::complex(0); return true; } - - size_t comma_loc = 0; - bool comma_found = false; - - for(size_t i=0; i(val_real); - } - else - { - const std::string token_real( &(str[1]), (comma_loc - 1 ) ); - const std::string token_imag( &(str[comma_loc+1]), (Nm1 - 1 - comma_loc) ); - - T val_real; - T val_imag; - - const bool state_real = convert_token(val_real, token_real); - const bool state_imag = convert_token(val_imag, token_imag); - - state = (state_real && state_imag); - - val = std::complex(val_real, val_imag); - } - - return state; - } - -//! 
Load a matrix in CSV text format (human readable) -template -inline -bool -load_csv_ascii(arma::Mat& x, std::istream& f, std::string&) - { - // TODO: replace with more efficient implementation - - bool load_okay = f.good(); - - f.clear(); - const std::fstream::pos_type pos1 = f.tellg(); - - // - // work out the size - - arma::uword f_n_rows = 0; - arma::uword f_n_cols = 0; - - std::string line_string; - std::stringstream line_stream; - - std::string token; - - while( f.good() && load_okay ) - { - std::getline(f, line_string); - - if(line_string.size() == 0) { break; } - - line_stream.clear(); - line_stream.str(line_string); - - arma::uword line_n_cols = 0; - - while(line_stream.good()) + bool state = false; + + if (comma_found == false) + { + // only the real part is available + + const std::string token_real(&(str[1]), (Nm1 - 1)); + + T val_real; + + state = convert_token(val_real, token_real); // use the non-complex version of this function + + val = std::complex(val_real); + } + else { - std::getline(line_stream, token, ','); - ++line_n_cols; + const std::string token_real(&(str[1]), (comma_loc - 1)); + const std::string token_imag(&(str[comma_loc + 1]), (Nm1 - 1 - comma_loc)); + + T val_real; + T val_imag; + + const bool state_real = convert_token(val_real, token_real); + const bool state_imag = convert_token(val_imag, token_imag); + + state = (state_real && state_imag); + + val = std::complex(val_real, val_imag); } - - if(f_n_cols < line_n_cols) { f_n_cols = line_n_cols; } - - ++f_n_rows; + + return state; } - - f.clear(); - f.seekg(pos1); - - x.zeros(f_n_rows, f_n_cols); - - arma::uword row = 0; - - while(f.good()) + + //! 
Load a matrix in CSV text format (human readable) + template + inline bool + load_csv_ascii(arma::Mat &x, std::istream &f, std::string &) { - std::getline(f, line_string); - - if(line_string.size() == 0) { break; } - - line_stream.clear(); - line_stream.str(line_string); - - arma::uword col = 0; - - while(line_stream.good()) + // TODO: replace with more efficient implementation + + bool load_okay = f.good(); + + f.clear(); + const std::fstream::pos_type pos1 = f.tellg(); + + // + // work out the size + + arma::uword f_n_rows = 0; + arma::uword f_n_cols = 0; + + std::string line_string; + std::stringstream line_stream; + + std::string token; + + while (f.good() && load_okay) { - std::getline(line_stream, token, ','); - - convert_token( x.at(row,col), token ); - - ++col; + std::getline(f, line_string); + + if (line_string.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line_string); + + arma::uword line_n_cols = 0; + + while (line_stream.good()) + { + std::getline(line_stream, token, ','); + ++line_n_cols; + } + + if (f_n_cols < line_n_cols) + { + f_n_cols = line_n_cols; + } + + ++f_n_rows; } - - ++row; + + f.clear(); + f.seekg(pos1); + + x.zeros(f_n_rows, f_n_cols); + + arma::uword row = 0; + + while (f.good()) + { + std::getline(f, line_string); + + if (line_string.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line_string); + + arma::uword col = 0; + + while (line_stream.good()) + { + std::getline(line_stream, token, ','); + + convert_token(x.at(row, col), token); + + ++col; + } + + ++row; + } + + return load_okay; } - - return load_okay; - } - -template -inline -arma_cold -bool -load_data(arma::Mat& x, const arma::file_type type, std::istream& f) - { - bool load_okay = false; - std::string err_msg; - std::string g = "y"; - switch(type) + template + inline arma_cold bool + load_data(arma::Mat &x, const arma::file_type type, std::istream &f) { - case arma::auto_detect: - load_okay = true; - break; - - case 
arma::csv_ascii: - load_okay = load_csv_ascii(x, f, g); - case arma::raw_ascii: - load_okay = true; - break; - - case arma::arma_ascii: - load_okay = true; - break; - - case arma::coord_ascii: - load_okay = true; - break; - - case arma::raw_binary: - load_okay = true; - break; - - case arma::arma_binary: - load_okay = true; - break; - - case arma::pgm_binary: - load_okay = true; - break; - - case arma::hdf5_binary: - return true; - break; - - case arma::hdf5_binary_trans: // kept for compatibility with earlier versions of Armadillo - return true; - break; - - default: - load_okay = false; + bool load_okay = false; + std::string err_msg; + std::string g = "y"; + switch (type) + { + case arma::auto_detect: + load_okay = true; + break; + + case arma::csv_ascii: + load_okay = load_csv_ascii(x, f, g); + case arma::raw_ascii: + load_okay = true; + break; + + case arma::arma_ascii: + load_okay = true; + break; + + case arma::coord_ascii: + load_okay = true; + break; + + case arma::raw_binary: + load_okay = true; + break; + + case arma::arma_binary: + load_okay = true; + break; + + case arma::pgm_binary: + load_okay = true; + break; + + case arma::hdf5_binary: + return true; + break; + + case arma::hdf5_binary_trans: // kept for compatibility with earlier versions of Armadillo + return true; + break; + + default: + load_okay = false; + } + + return load_okay; } - - return load_okay; - } -} // namespace data -} // namespace mlpack + } // namespace data +} // namespace mlpack #endif From 08b0d1637b96cef62461c3ee9dcc8aaaee2f34cd Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 17 May 2021 12:44:28 +0530 Subject: [PATCH 005/112] Adding source of the original file --- src/mlpack/core/data/new_parser.hpp | 6 ++++++ src/mlpack/core/data/new_parser_impl.hpp | 3 +++ 2 files changed, 9 insertions(+) diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp index dd71f056729..9c72b410a0e 100644 --- a/src/mlpack/core/data/new_parser.hpp +++ 
b/src/mlpack/core/data/new_parser.hpp @@ -1,3 +1,9 @@ +/* + * Fucntions defined in this files originate from armadillo + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * and are adapted for mlpack +*/ + #ifndef MLPACK_CORE_DATA_NEW_PARSER_HPP #define MLPACK_CORE_DATA_NEW_PARSER_HPP diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp index efddb8af3e2..d1e1c2c21bf 100644 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -1,3 +1,6 @@ +/* This file is originated from armadillo and adapted for mlpack + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp +*/ #ifndef MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP #define MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP From 4b672a7e5e5c9af689369ce252305771ad8dc9a3 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 17 May 2021 13:09:14 +0530 Subject: [PATCH 006/112] Adding license --- src/mlpack/core.hpp | 1 + src/mlpack/core/data/new_parser.hpp | 19 ++++++++++++++++--- src/mlpack/core/data/new_parser_impl.hpp | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp index d29a1747246..e96a772d61c 100644 --- a/src/mlpack/core.hpp +++ b/src/mlpack/core.hpp @@ -90,6 +90,7 @@ #include #include #include +#include // mlpack::backtrace only for linux #ifdef HAS_BFD_DL diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp index 9c72b410a0e..be79f3173b4 100644 --- a/src/mlpack/core/data/new_parser.hpp +++ b/src/mlpack/core/data/new_parser.hpp @@ -1,7 +1,20 @@ -/* - * Fucntions defined in this files originate from armadillo +/* Fucntions defined in this files originate from armadillo + * This file is originated from armadillo and adapted for mlpack * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * and 
are adapted for mlpack + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ */ #ifndef MLPACK_CORE_DATA_NEW_PARSER_HPP diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp index d1e1c2c21bf..d8bfe368e0b 100644 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ b/src/mlpack/core/data/new_parser_impl.hpp @@ -1,5 +1,19 @@ /* This file is originated from armadillo and adapted for mlpack * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ------------------------------------------------------------------------ */ #ifndef MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP #define MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP From a8534e89386c2512520e35a2c6927d9b32927e67 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 15:42:11 +0530 Subject: [PATCH 007/112] Changed the name to csv_parser, minor style changes --- src/mlpack/core/data/CMakeLists.txt | 4 +- src/mlpack/core/data/csv_parser.hpp | 95 +++++++++++ src/mlpack/core/data/csv_parser_impl.hpp | 202 +++++++++++++++++++++++ 3 files changed, 299 insertions(+), 2 deletions(-) create mode 100644 src/mlpack/core/data/csv_parser.hpp create mode 100644 src/mlpack/core/data/csv_parser_impl.hpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index a8b2d948a45..746e4cd0209 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -34,8 +34,8 @@ set(SOURCES confusion_matrix.hpp one_hot_encoding.hpp one_hot_encoding_impl.hpp - new_parser.hpp - new_parser_impl.hpp + csv_parser.hpp + csv_parser_impl.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp new file mode 100644 index 00000000000..dda719c4019 --- /dev/null +++ b/src/mlpack/core/data/csv_parser.hpp @@ -0,0 +1,95 @@ +/* Fucntions defined in this files originate from armadillo + * This file is originated from armadillo and adapted for mlpack + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ +*/ + +#ifndef MLPACK_CORE_DATA_CSV_PARSER_HPP +#define MLPACK_CORE_DATA_CSV_PARSER_HPP + +namespace mlpack +{ +namespace data +{ + + // template + // inline + // bool + // convert_token(eT& val, const std::string& token); + + // template + // inline + // bool + // convert_token(std::complex& val, const std::string& token); + + // template + // inline + // bool + // load_csv_ascii(arma::Mat& x, std::istream& f, std::string&); + + // template + // inline + // arma_cold + // bool + // load_data(const std::string name, const arma::file_type type); + + enum struct file_type : unsigned int + { + file_type_unknown, + auto_detect, //!< attempt to automatically detect the file type + raw_ascii, //!< raw text (ASCII), without a header + arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size + csv_ascii, //!< comma separated values (CSV), without a header + raw_binary, //!< raw binary format (machine dependent), without a header + arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + pgm_binary, //!< Portable Grey Map (greyscale image) + ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes + hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data + hdf5_binary_trans, //!< [DO NOT USE - deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows + coord_ascii //!< simple co-ordinate format for 
sparse matrices (indices start at zero) + }; + + template + struct is_real + { + static constexpr bool value = false; + static constexpr bool yes = false; + static constexpr bool no = true; + }; + + template + inline bool ConvertToken(eT& val, const std::string& token); + + template + inline bool LoadCSV(arma::Mat& x, std::istream& f, std::string&); + + template + inline bool LoadCSV(std::string& name, std::string& err_msg); + + template + inline bool Load(const std::string& name, const file_type type, const bool print_status); + + template + inline bool LoadData(const std::string& name, const file_type type, const bool print_status); + + +} // namespace data +} // namespace mlpack + +// Include implementation +#include "new_parser_impl.hpp" + +#endif diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp new file mode 100644 index 00000000000..81d3fedc13f --- /dev/null +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -0,0 +1,202 @@ +/* This file is originated from armadillo and adapted for mlpack + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ------------------------------------------------------------------------ +*/ +#ifndef MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP +#define MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP + +#include "csv_parser.hpp" + +namespace mlpack +{ +namespace data +{ + template + inline bool convert_token(eT& val, const std::string& token) + { + const size_t N = size_t(token.length()); + + if (N == 0) + { + val = eT(0); + return true; + } + + const char* str = token.c_str(); + + if ((N == 3) || (N == 4)) + { + const bool neg = (str[0] == '-'); + const bool pos = (str[0] == '+'); + + const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; + + // discuss about this fucntion + } + + char* endptr = nullptr; + + if (is_real::value) + { + val = eT(std::strtod(str, &endptr)); + } + + if(str == endptr) { return false; } + + return true; + } + + template + inline bool load_csv_ascii(arma::Mat& x, std::istream& f, std::string&) + { + bool load_okay = f.good(); + + f.clear(); + const std::fstream::pos_type pos1 = f.tellg(); + + // use own implementation as you don't + // wanna depend on arma in core mlpack + size_t f_n_rows = 0; + size_t f_n_cols = 0; + + std::string line_string; + std::stringstream line_stream; + + std::string token; + + while(f.good() && load_okay) + { + std::getline(f, line_string); + + if (line_string.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line_string); + + size_t line_n_cols = 0; + + while(line_stream.good()) + { + // reading each element of the row + std::getline(line_stream, token, ','); + ++line_n_cols; + } + + if (f_n_cols < line_n_cols) + { + f_n_cols = line_n_cols; + } + + ++f_n_rows; + } + + f.clear(); + f.seekg(pos1); + + x.zeros(f_n_rows, f_n_cols); + + size_t row = 0; + + while (f.good()) + { + std::getline(f, line_string); + + if (line_string.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line_string); + + size_t col = 0; + + while (line_stream.good()) + { + std::getline(line_stream, token, ','); + + 
convert_token(x.at(row, col), token); + + ++col; + } + ++row; + } + + return load_okay; + } + + template + inline bool LoadCSV(const std::string& name, std::string& err_msg) + { + std::fstream f; + arma::Mat x; + + f.open(name.c_str(), std::fstream::in); + + bool load_okay = f.is_open(); + + if (load_okay == false) + { + return false; + } + + if (load_okay) + { + load_okay = LoadCSV(x, f, err_msg); + } + + f.close(); + + return load_okay; + } + + template + inline + bool + Load(const std::string& name, const file_type type, const bool print_status) + { + bool load_okay = false; + std::string err_msg; + + load_okay = LoadCSV(name, err_msg); + + return load_okay; + } + + template + inline bool LoadData(const std::string& name, const file_type type, const bool print_status) + { + bool load_okay = false; + std::string err_msg; + + switch (type) + { + case file_type::csv_ascii: + return Load(name, type, print_status); + + // For own implementation + // return load(name, type, print_status); + + break; + } + } +} // namespace data +} // namespace mlpack + +#endif From 74ad69cdaeac0ad69a5b970707268f5f71fcb3d7 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 15:44:53 +0530 Subject: [PATCH 008/112] minor style changes --- src/mlpack/core/data/new_parser.hpp | 54 ---- src/mlpack/core/data/new_parser_impl.hpp | 332 ----------------------- src/mlpack/core/data/test.cpp | 0 3 files changed, 386 deletions(-) delete mode 100644 src/mlpack/core/data/new_parser.hpp delete mode 100644 src/mlpack/core/data/new_parser_impl.hpp create mode 100644 src/mlpack/core/data/test.cpp diff --git a/src/mlpack/core/data/new_parser.hpp b/src/mlpack/core/data/new_parser.hpp deleted file mode 100644 index be79f3173b4..00000000000 --- a/src/mlpack/core/data/new_parser.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Fucntions defined in this files originate from armadillo - * This file is originated from armadillo and adapted for mlpack - * 
https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ -*/ - -#ifndef MLPACK_CORE_DATA_NEW_PARSER_HPP -#define MLPACK_CORE_DATA_NEW_PARSER_HPP - -namespace mlpack { -namespace data { - -template -inline -bool -convert_token(eT& val, const std::string& token); - -template -inline -bool -convert_token(std::complex& val, const std::string& token); - -template -inline -bool -load_csv_ascii(arma::Mat& x, std::istream& f, std::string&); - -template -inline -arma_cold -bool -load_data(const std::string name, const arma::file_type type); - - - } // namespace data -} // namespace mlpack - -// Include implementation -#include "new_parser_impl.hpp" - -#endif diff --git a/src/mlpack/core/data/new_parser_impl.hpp b/src/mlpack/core/data/new_parser_impl.hpp deleted file mode 100644 index d8bfe368e0b..00000000000 --- a/src/mlpack/core/data/new_parser_impl.hpp +++ /dev/null @@ -1,332 +0,0 @@ -/* This file is originated from armadillo and adapted for mlpack - * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * 
- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ -*/ -#ifndef MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP -#define MLPACK_CORE_DATA_NEW_PARSER_IMPL_HPP - -#include "new_parser.hpp" - -namespace mlpack -{ - namespace data - { - - template - inline bool - convert_token(eT &val, const std::string &token) - { - const size_t N = size_t(token.length()); - - if (N == 0) - { - val = eT(0); - return true; - } - - const char *str = token.c_str(); - - if ((N == 3) || (N == 4)) - { - const bool neg = (str[0] == '-'); - const bool pos = (str[0] == '+'); - - const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; - - const char sig_a = str[offset]; - const char sig_b = str[offset + 1]; - const char sig_c = str[offset + 2]; - - if (((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F'))) - { - val = neg ? 
arma::cond_rel::value>::make_neg(arma::Datum::inf) : arma::Datum::inf; - - return true; - } - else if (((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) && ((sig_c == 'n') || (sig_c == 'N'))) - { - val = arma::Datum::nan; - - return true; - } - } - - char *endptr = nullptr; - - if (arma::is_real::value) - { - val = eT(std::strtod(str, &endptr)); - } - else - { - if (arma::is_signed::value) - { - // signed integer - - val = eT(std::strtoll(str, &endptr, 10)); - } - else - { - // unsigned integer - - if (str[0] == '-') - { - val = eT(0); - return true; - } - - val = eT(std::strtoull(str, &endptr, 10)); - } - } - - if (str == endptr) - { - return false; - } - - return true; - } - - template - inline bool - convert_token(std::complex &val, const std::string &token) - { - const size_t N = size_t(token.length()); - const size_t Nm1 = N - 1; - - if (N == 0) - { - val = std::complex(0); - return true; - } - - const char *str = token.c_str(); - - // valid complex number formats: - // (real,imag) - // (real) - // () - - if ((token[0] != '(') || (token[Nm1] != ')')) - { - // no brackets, so treat the token as a non-complex number - - T val_real; - - const bool state = convert_token(val_real, token); // use the non-complex version of this function - - val = std::complex(val_real); - - return state; - } - - // does the token contain only the () brackets? 
- if (N <= 2) - { - val = std::complex(0); - return true; - } - - size_t comma_loc = 0; - bool comma_found = false; - - for (size_t i = 0; i < N; ++i) - { - if (str[i] == ',') - { - comma_loc = i; - comma_found = true; - break; - } - } - - bool state = false; - - if (comma_found == false) - { - // only the real part is available - - const std::string token_real(&(str[1]), (Nm1 - 1)); - - T val_real; - - state = convert_token(val_real, token_real); // use the non-complex version of this function - - val = std::complex(val_real); - } - else - { - const std::string token_real(&(str[1]), (comma_loc - 1)); - const std::string token_imag(&(str[comma_loc + 1]), (Nm1 - 1 - comma_loc)); - - T val_real; - T val_imag; - - const bool state_real = convert_token(val_real, token_real); - const bool state_imag = convert_token(val_imag, token_imag); - - state = (state_real && state_imag); - - val = std::complex(val_real, val_imag); - } - - return state; - } - - //! Load a matrix in CSV text format (human readable) - template - inline bool - load_csv_ascii(arma::Mat &x, std::istream &f, std::string &) - { - // TODO: replace with more efficient implementation - - bool load_okay = f.good(); - - f.clear(); - const std::fstream::pos_type pos1 = f.tellg(); - - // - // work out the size - - arma::uword f_n_rows = 0; - arma::uword f_n_cols = 0; - - std::string line_string; - std::stringstream line_stream; - - std::string token; - - while (f.good() && load_okay) - { - std::getline(f, line_string); - - if (line_string.size() == 0) - { - break; - } - - line_stream.clear(); - line_stream.str(line_string); - - arma::uword line_n_cols = 0; - - while (line_stream.good()) - { - std::getline(line_stream, token, ','); - ++line_n_cols; - } - - if (f_n_cols < line_n_cols) - { - f_n_cols = line_n_cols; - } - - ++f_n_rows; - } - - f.clear(); - f.seekg(pos1); - - x.zeros(f_n_rows, f_n_cols); - - arma::uword row = 0; - - while (f.good()) - { - std::getline(f, line_string); - - if (line_string.size() == 0) 
- { - break; - } - - line_stream.clear(); - line_stream.str(line_string); - - arma::uword col = 0; - - while (line_stream.good()) - { - std::getline(line_stream, token, ','); - - convert_token(x.at(row, col), token); - - ++col; - } - - ++row; - } - - return load_okay; - } - - template - inline arma_cold bool - load_data(arma::Mat &x, const arma::file_type type, std::istream &f) - { - bool load_okay = false; - std::string err_msg; - std::string g = "y"; - switch (type) - { - case arma::auto_detect: - load_okay = true; - break; - - case arma::csv_ascii: - load_okay = load_csv_ascii(x, f, g); - case arma::raw_ascii: - load_okay = true; - break; - - case arma::arma_ascii: - load_okay = true; - break; - - case arma::coord_ascii: - load_okay = true; - break; - - case arma::raw_binary: - load_okay = true; - break; - - case arma::arma_binary: - load_okay = true; - break; - - case arma::pgm_binary: - load_okay = true; - break; - - case arma::hdf5_binary: - return true; - break; - - case arma::hdf5_binary_trans: // kept for compatibility with earlier versions of Armadillo - return true; - break; - - default: - load_okay = false; - } - - return load_okay; - } - } // namespace data -} // namespace mlpack - -#endif diff --git a/src/mlpack/core/data/test.cpp b/src/mlpack/core/data/test.cpp new file mode 100644 index 00000000000..e69de29bb2d From e2e25d3dab7355ed97a1f07617a2f1cf5bd52e42 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 15:45:49 +0530 Subject: [PATCH 009/112] minor style changes --- src/mlpack/core/data/test.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/mlpack/core/data/test.cpp diff --git a/src/mlpack/core/data/test.cpp b/src/mlpack/core/data/test.cpp deleted file mode 100644 index e69de29bb2d..00000000000 From 89e2942760d70916bb79eab3a3a21a9254a91d86 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 15:51:00 +0530 Subject: [PATCH 010/112] Add MatType to Load fucntions --- src/mlpack/core/data/load.hpp 
| 145 +++------------------------------- 1 file changed, 12 insertions(+), 133 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 9b54f43ce13..9c952940f49 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -1,7 +1,7 @@ /** * @file core/data/load.hpp * @author Ryan Curtin - * + * * Load an Armadillo matrix from file. This is necessary because Armadillo does * not transpose matrices on input, and it allows us to give better error * output. @@ -25,6 +25,10 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { +// We should not need extern definations +// anymore as we are removing the boost +// spirit + /** * Loads a matrix from file, guessing the filetype from the extension. This * will transpose the matrix at load time (unless the transpose parameter is set @@ -66,7 +70,7 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal = false, @@ -102,91 +106,12 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading (default true). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::SpMat& matrix, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; these declarations aren't helpful to - * users. - * - * @cond - */ - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -// size_t and uword should be one of these three typedefs. 
-extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -/** - * @endcond - */ - /** * Load a column vector from a file, guessing the filetype from the extension. * @@ -214,7 +139,7 @@ extern template bool Load(const std::string&, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Col& vec, const bool fatal = false); @@ -246,7 +171,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::string& filename, arma::Row& rowvec, const bool fatal = false); @@ -283,59 +208,13 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, DatasetMapper& info, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; they aren't helpful for users to know - * about. - * - * @cond - */ - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -/** - * @endcond - */ - /** * Load a model from a file, guessing the filetype from the extension, or, * optionally, loading the specified format. If automatic extension detection @@ -381,7 +260,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, ImageInfo& info, @@ -396,7 +275,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::vector& files, arma::Mat& matrix, ImageInfo& info, From 994934b19aa171e5754362a161bfa9878f09b05c Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 15:53:20 +0530 Subject: [PATCH 011/112] included csv_parser.hpp in core.hpp --- src/mlpack/core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp index e96a772d61c..e05dc9f2083 100644 --- a/src/mlpack/core.hpp +++ b/src/mlpack/core.hpp @@ -90,7 +90,7 @@ #include #include #include -#include +#include // mlpack::backtrace only for linux #ifdef HAS_BFD_DL From e246209a55d4993707d35ecc693982630e488f7a Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 2 Jun 2021 16:02:24 +0530 Subject: [PATCH 012/112] changes --- src/mlpack/core/data/csv_parser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index dda719c4019..8c109034fe2 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -90,6 +90,6 @@ namespace data } // namespace mlpack // Include implementation -#include "new_parser_impl.hpp" +#include "csv_parser_impl.hpp" #endif From 22ebc35ffabba2b58cc6d42b5358ac2f1f19a967 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 7 Jun 2021 14:15:50 +0530 Subject: [PATCH 013/112] Removed arma::file_type, changed template parameter to MatType only --- src/mlpack/core/data/csv_parser.hpp | 54 ++------- src/mlpack/core/data/csv_parser_impl.hpp | 148 +++++++++++++++-------- src/mlpack/core/data/load.hpp | 145 ++++++++++++++++++++-- src/mlpack/core/data/load_impl.hpp | 4 +- 4 files changed, 244 insertions(+), 107 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 8c109034fe2..48dadc91b22 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -20,32 +20,13 @@ #ifndef 
MLPACK_CORE_DATA_CSV_PARSER_HPP #define MLPACK_CORE_DATA_CSV_PARSER_HPP +#include +#include + namespace mlpack { namespace data { - - // template - // inline - // bool - // convert_token(eT& val, const std::string& token); - - // template - // inline - // bool - // convert_token(std::complex& val, const std::string& token); - - // template - // inline - // bool - // load_csv_ascii(arma::Mat& x, std::istream& f, std::string&); - - // template - // inline - // arma_cold - // bool - // load_data(const std::string name, const arma::file_type type); - enum struct file_type : unsigned int { file_type_unknown, @@ -61,32 +42,17 @@ namespace data hdf5_binary_trans, //!< [DO NOT USE - deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) }; - - template - struct is_real - { - static constexpr bool value = false; - static constexpr bool yes = false; - static constexpr bool no = true; - }; - - template - inline bool ConvertToken(eT& val, const std::string& token); - - template - inline bool LoadCSV(arma::Mat& x, std::istream& f, std::string&); - - template - inline bool LoadCSV(std::string& name, std::string& err_msg); - template - inline bool Load(const std::string& name, const file_type type, const bool print_status); + template + bool ConvertToken(typename MatType::elem_type& val, const std::string& token); - template - inline bool LoadData(const std::string& name, const file_type type, const bool print_status); + template + bool LoadCSVV(MatType& x, std::fstream& f, std::string&); + template + bool LoadData(const std::string& name, MatType& x, const mlpack::data::file_type type); -} // namespace data + } // namespace data } // namespace mlpack // Include implementation diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 81d3fedc13f..c4b5b13fc0c 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ 
b/src/mlpack/core/data/csv_parser_impl.hpp @@ -24,19 +24,30 @@ namespace mlpack { namespace data { - template - inline bool convert_token(eT& val, const std::string& token) + + /** + * Given the address of a martix element(val) + * sets it equal to the provided value(token) + * example calling: convert_token(x.at(row, col), token) + */ + template + bool ConvertToken(typename MatType::elem_type& val, const std::string& token) { const size_t N = size_t(token.length()); if (N == 0) { - val = eT(0); + val = typename MatType::elem_type(0); + return true; } const char* str = token.c_str(); + // checking for nan, +inf, -inf + // in both upper and lower case + // using arma::Datum which basically + // contains all the physical constants if ((N == 3) || (N == 4)) { const bool neg = (str[0] == '-'); @@ -44,31 +55,71 @@ namespace data const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; - // discuss about this fucntion + const char sig_a = str[offset]; + const char sig_b = str[offset+1]; + const char sig_c = str[offset+2]; + + if (((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) + && ((sig_c == 'f') || (sig_c == 'F'))) + { + // val = if(neg == true) ? -INF : +INF + val = neg ? 
-(std::numeric_limits::infinity()) : std::numeric_limits::infinity(); + + return true; + } + else if (((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) + && ((sig_c == 'n') || (sig_c == 'N'))) + { + val = std::numeric_limits::quiet_NaN(); + + return true; + } } char* endptr = nullptr; - if (is_real::value) + if (std::is_floating_point::value || std::is_integral::value) { - val = eT(std::strtod(str, &endptr)); + val = typename MatType::elem_type(std::strtod(str, &endptr)); + } + else + { + if (std::is_signed::value) + { + val = typename MatType::elem_type(std::strtoll(str, &endptr, 10)); + } + else + { + if (str[0] == '-') + { + val = typename MatType::elem_type(0); + return true; + } + + val = typename MatType::elem_type( std::strtoull(str, &endptr, 10) ); + } } - if(str == endptr) { return false; } + if (str == endptr) + { + return false; + } return true; } - template - inline bool load_csv_ascii(arma::Mat& x, std::istream& f, std::string&) + /** + * Loads the data from the csv file + * into the give MatType + */ + template + bool LoadCSVV(MatType& x, std::fstream& f, std::string&) { bool load_okay = f.good(); f.clear(); const std::fstream::pos_type pos1 = f.tellg(); - // use own implementation as you don't - // wanna depend on arma in core mlpack size_t f_n_rows = 0; size_t f_n_cols = 0; @@ -131,7 +182,7 @@ namespace data { std::getline(line_stream, token, ','); - convert_token(x.at(row, col), token); + ConvertToken(x.at(row, col), token); ++col; } @@ -141,60 +192,57 @@ namespace data return load_okay; } - template - inline bool LoadCSV(const std::string& name, std::string& err_msg) + template + bool LoadData(const std::string& name, MatType& x, const mlpack::data::file_type type) { + // bool load_okay = false; + std::string err_msg; + std::string print_status; std::fstream f; - arma::Mat x; f.open(name.c_str(), std::fstream::in); - bool load_okay = f.is_open(); - - if (load_okay == false) + switch (type) { - return false; - } + case 
mlpack::data::file_type::csv_ascii: + return LoadCSVV(x, f, print_status); - if (load_okay) - { - load_okay = LoadCSV(x, f, err_msg); - } + case mlpack::data::file_type::file_type_unknown: + return true; - f.close(); + case mlpack::data::file_type::auto_detect: + return true; - return load_okay; - } + case mlpack::data::file_type::raw_ascii: + return true; - template - inline - bool - Load(const std::string& name, const file_type type, const bool print_status) - { - bool load_okay = false; - std::string err_msg; + case mlpack::data::file_type::arma_ascii: + return true; - load_okay = LoadCSV(name, err_msg); + case mlpack::data::file_type::raw_binary: + return true; - return load_okay; - } + case mlpack::data::file_type::arma_binary: + return true; - template - inline bool LoadData(const std::string& name, const file_type type, const bool print_status) - { - bool load_okay = false; - std::string err_msg; + case mlpack::data::file_type::pgm_binary: + return true; + + case mlpack::data::file_type::ppm_binary: + return true; + + case mlpack::data::file_type::hdf5_binary: + return true; + + case mlpack::data::file_type::hdf5_binary_trans: + return true; + + case mlpack::data::file_type::coord_ascii: + return true; - switch (type) - { - case file_type::csv_ascii: - return Load(name, type, print_status); - - // For own implementation - // return load(name, type, print_status); - break; } + return false; } } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 9c952940f49..9b54f43ce13 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -1,7 +1,7 @@ /** * @file core/data/load.hpp * @author Ryan Curtin - * + * * Load an Armadillo matrix from file. This is necessary because Armadillo does * not transpose matrices on input, and it allows us to give better error * output. @@ -25,10 +25,6 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. 
*/ { -// We should not need extern definations -// anymore as we are removing the boost -// spirit - /** * Loads a matrix from file, guessing the filetype from the extension. This * will transpose the matrix at load time (unless the transpose parameter is set @@ -70,7 +66,7 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal = false, @@ -106,12 +102,91 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading (default true). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::SpMat& matrix, const bool fatal = false, const bool transpose = true); +/** + * Don't document these with doxygen; these declarations aren't helpful to + * users. + * + * @cond + */ + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +// size_t and uword should be one of these three typedefs. 
+extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const arma::file_type); + +extern template bool Load(const std::string&, + arma::SpMat&, + const bool, + const bool); + +extern template bool Load(const std::string&, + arma::SpMat&, + const bool, + const bool); + +extern template bool Load(const std::string&, + arma::SpMat&, + const bool, + const bool); + +extern template bool Load(const std::string&, + arma::SpMat&, + const bool, + const bool); + +extern template bool Load(const std::string&, + arma::SpMat&, + const bool, + const bool); + +/** + * @endcond + */ + /** * Load a column vector from a file, guessing the filetype from the extension. * @@ -139,7 +214,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Col& vec, const bool fatal = false); @@ -171,7 +246,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::string& filename, arma::Row& rowvec, const bool fatal = false); @@ -208,13 +283,59 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, DatasetMapper& info, const bool fatal = false, const bool transpose = true); +/** + * Don't document these with doxygen; they aren't helpful for users to know + * about. + * + * @cond + */ + +extern template bool Load( + const std::string&, + arma::Mat&, + DatasetMapper&, + const bool, + const bool); + +extern template bool Load( + const std::string&, + arma::Mat&, + DatasetMapper&, + const bool, + const bool); + +extern template bool Load( + const std::string&, + arma::Mat&, + DatasetMapper&, + const bool, + const bool); + +extern template bool Load( + const std::string&, + arma::Mat&, + DatasetMapper&, + const bool, + const bool); + +extern template bool Load( + const std::string&, + arma::Mat&, + DatasetMapper&, + const bool, + const bool); + +/** + * @endcond + */ + /** * Load a model from a file, guessing the filetype from the extension, or, * optionally, loading the specified format. If automatic extension detection @@ -260,7 +381,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, arma::Mat& matrix, ImageInfo& info, @@ -275,7 +396,7 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. 
*/ -template +template bool Load(const std::vector& files, arma::Mat& matrix, ImageInfo& info, diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 344a8b6c13b..7ec55843684 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -164,8 +164,10 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. bool success; + if (loadType != arma::hdf5_binary) - success = matrix.load(stream, loadType); + success = LoadData(filename, matrix, file_type::csv_ascii); + //success = matrix.load(stream, loadType); else success = matrix.load(filename, loadType); From a7456b1a3b226b7dfd81c5f58199c0712e492784 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 9 Jun 2021 12:44:35 +0530 Subject: [PATCH 014/112] Removing LoadData() --- src/mlpack/core/data/csv_parser.hpp | 31 ++++---- src/mlpack/core/data/csv_parser_impl.hpp | 94 ++++++------------------ src/mlpack/core/data/load_impl.hpp | 6 +- 3 files changed, 41 insertions(+), 90 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 48dadc91b22..d0b51869a2d 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -30,30 +30,27 @@ namespace data enum struct file_type : unsigned int { file_type_unknown, - auto_detect, //!< attempt to automatically detect the file type - raw_ascii, //!< raw text (ASCII), without a header - arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size - csv_ascii, //!< comma separated values (CSV), without a header - raw_binary, //!< raw binary format (machine dependent), without a header - arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size - pgm_binary, //!< Portable Grey Map (greyscale image) - ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes - hdf5_binary, //!< HDF5: open binary format, not 
specific to Armadillo, which can store arbitrary data - hdf5_binary_trans, //!< [DO NOT USE - deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows + auto_detect, //!< attempt to automatically detect the file type + raw_ascii, //!< raw text (ASCII), without a header + arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size + csv_ascii, //!< comma separated values (CSV), without a header + raw_binary, //!< raw binary format (machine dependent), without a header + arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + pgm_binary, //!< Portable Grey Map (greyscale image) + ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes + hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data + hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) }; - + template bool ConvertToken(typename MatType::elem_type& val, const std::string& token); - + template bool LoadCSVV(MatType& x, std::fstream& f, std::string&); - - template - bool LoadData(const std::string& name, MatType& x, const mlpack::data::file_type type); - } // namespace data -} // namespace mlpack + } // namespace data +} // namespace mlpack // Include implementation #include "csv_parser_impl.hpp" diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index c4b5b13fc0c..4d5a8eb7cdb 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -36,10 +36,10 @@ namespace data const size_t N = size_t(token.length()); if (N == 0) - { + { val = typename MatType::elem_type(0); - return true; + return true; } const char* str = token.c_str(); @@ -55,20 +55,23 @@ namespace data const size_t offset = 
((neg || pos) && (N == 4)) ? 1 : 0; - const char sig_a = str[offset]; + const char sig_a = str[offset]; const char sig_b = str[offset+1]; const char sig_c = str[offset+2]; - if (((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) - && ((sig_c == 'f') || (sig_c == 'F'))) + if (((sig_a == 'i') || (sig_a == 'I')) && + ((sig_b == 'n') || (sig_b == 'N')) && + ((sig_c == 'f') || (sig_c == 'F'))) { - // val = if(neg == true) ? -INF : +INF - val = neg ? -(std::numeric_limits::infinity()) : std::numeric_limits::infinity(); - + // val = if(neg == true) ? -INF : +INF + val = neg ? -(std::numeric_limits::infinity()) : + std::numeric_limits::infinity(); + return true; } - else if (((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) - && ((sig_c == 'n') || (sig_c == 'N'))) + else if (((sig_a == 'n') || (sig_a == 'N')) && + ((sig_b == 'a') || (sig_b == 'A')) && + ((sig_c == 'n') || (sig_c == 'N'))) { val = std::numeric_limits::quiet_NaN(); @@ -78,7 +81,8 @@ namespace data char* endptr = nullptr; - if (std::is_floating_point::value || std::is_integral::value) + if ((std::is_floating_point::value) || + (std::is_integral::value)) { val = typename MatType::elem_type(std::strtod(str, &endptr)); } @@ -96,8 +100,8 @@ namespace data return true; } - val = typename MatType::elem_type( std::strtoull(str, &endptr, 10) ); - } + val = typename MatType::elem_type( std::strtoull(str, &endptr, 10)); + } } if (str == endptr) @@ -128,13 +132,13 @@ namespace data std::string token; - while(f.good() && load_okay) + while (f.good() && load_okay) { std::getline(f, line_string); if (line_string.size() == 0) - { - break; + { + break; } line_stream.clear(); @@ -142,7 +146,7 @@ namespace data size_t line_n_cols = 0; - while(line_stream.good()) + while (line_stream.good()) { // reading each element of the row std::getline(line_stream, token, ','); @@ -153,7 +157,7 @@ namespace data { f_n_cols = line_n_cols; } - + ++f_n_rows; } @@ -189,61 +193,9 @@ namespace data 
++row; } - return load_okay; + return load_okay; } - template - bool LoadData(const std::string& name, MatType& x, const mlpack::data::file_type type) - { - // bool load_okay = false; - std::string err_msg; - std::string print_status; - std::fstream f; - - f.open(name.c_str(), std::fstream::in); - - switch (type) - { - case mlpack::data::file_type::csv_ascii: - return LoadCSVV(x, f, print_status); - - case mlpack::data::file_type::file_type_unknown: - return true; - - case mlpack::data::file_type::auto_detect: - return true; - - case mlpack::data::file_type::raw_ascii: - return true; - - case mlpack::data::file_type::arma_ascii: - return true; - - case mlpack::data::file_type::raw_binary: - return true; - - case mlpack::data::file_type::arma_binary: - return true; - - case mlpack::data::file_type::pgm_binary: - return true; - - case mlpack::data::file_type::ppm_binary: - return true; - - case mlpack::data::file_type::hdf5_binary: - return true; - - case mlpack::data::file_type::hdf5_binary_trans: - return true; - - case mlpack::data::file_type::coord_ascii: - return true; - - break; - } - return false; - } } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 7ec55843684..4b44d732980 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -164,10 +164,12 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. 
bool success; + std::string print_status; if (loadType != arma::hdf5_binary) - success = LoadData(filename, matrix, file_type::csv_ascii); - //success = matrix.load(stream, loadType); + success = LoadCSVV(matrix, stream, print_status); + // success = LoadData(filename, matrix, file_type::csv_ascii); + // success = matrix.load(stream, loadType); else success = matrix.load(filename, loadType); From 7ff9b3cf6f56d25fa0fbda50418618bfd8158ec8 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 10 Jun 2021 02:57:54 +0530 Subject: [PATCH 015/112] Temporary patch to handle other file types --- src/mlpack/core/data/load_impl.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 4b44d732980..f982d418ab6 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -167,7 +167,12 @@ bool Load(const std::string& filename, std::string print_status; if (loadType != arma::hdf5_binary) - success = LoadCSVV(matrix, stream, print_status); + { + if(loadType == arma::csv_ascii) + success = LoadCSVV(matrix, stream, print_status); + else + matrix.load(stream, loadType); + } // success = LoadData(filename, matrix, file_type::csv_ascii); // success = matrix.load(stream, loadType); else From 6c6aa1be6d99808f1bc540a6367463d54debf3b2 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 10 Jun 2021 03:08:14 +0530 Subject: [PATCH 016/112] fix --- src/mlpack/core/data/load_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index f982d418ab6..0845c9d44b4 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -171,7 +171,7 @@ bool Load(const std::string& filename, if(loadType == arma::csv_ascii) success = LoadCSVV(matrix, stream, print_status); else - matrix.load(stream, loadType); + success = matrix.load(stream, loadType); } // success = 
LoadData(filename, matrix, file_type::csv_ascii); // success = matrix.load(stream, loadType); From ad1a49555b875ca0544ae9e53e9814f922d5c983 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 16 Jun 2021 16:00:54 +0530 Subject: [PATCH 017/112] Added doxygen comments --- src/mlpack/core/data/csv_parser.hpp | 93 +++++++++++++++++------- src/mlpack/core/data/csv_parser_impl.hpp | 37 +++++----- src/mlpack/core/data/load_impl.hpp | 3 +- 3 files changed, 86 insertions(+), 47 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index d0b51869a2d..b5d01ee2e6e 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -1,5 +1,15 @@ -/* Fucntions defined in this files originate from armadillo - * This file is originated from armadillo and adapted for mlpack +/** + * @file core/data/csv_parser.hpp + * @author Gopi M. Tatiraju + * + * This csv parser is designed by taking reference from armadillo's csv parser. + * In this mlpack's version, all the arma dependencies were removed or replaced + * accordingly, making the parser totally independent of armadillo. + * + * This parser will be totally independent to any linear algebra library. + * This can be used to load data into any matrix, i.e. arma and bandicoot + * in future. + * * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) * Copyright 2008-2016 National ICT Australia (NICTA) @@ -15,41 +25,70 @@ * See the License for the specific language governing permissions and * limitations under the License. * ------------------------------------------------------------------------ -*/ - + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. 
If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ #ifndef MLPACK_CORE_DATA_CSV_PARSER_HPP #define MLPACK_CORE_DATA_CSV_PARSER_HPP #include -#include -namespace mlpack -{ -namespace data +namespace mlpack{ +namespace data{ +enum struct file_type : unsigned int { - enum struct file_type : unsigned int - { - file_type_unknown, - auto_detect, //!< attempt to automatically detect the file type - raw_ascii, //!< raw text (ASCII), without a header - arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size - csv_ascii, //!< comma separated values (CSV), without a header - raw_binary, //!< raw binary format (machine dependent), without a header - arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size - pgm_binary, //!< Portable Grey Map (greyscale image) - ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes - hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data - hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows - coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) + file_type_unknown, + auto_detect, //!< attempt to automatically detect the file type + raw_ascii, //!< raw text (ASCII), without a header + arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size + csv_ascii, //!< comma separated values (CSV), without a header + raw_binary, //!< raw binary format (machine dependent), without a header + arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + pgm_binary, //!< Portable Grey Map (greyscale image) + ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes + hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store 
arbitrary data + hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows + coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) }; - template - bool ConvertToken(typename MatType::elem_type& val, const std::string& token); +/** + * Convert the given string token to assigned datatype and assign + * this value to the given address. The address here will be a + * matrix location. + * + * Token is always read as a string, if the given token is +/-INF or NAN + * it converts them to infinity and NAN using numeric_limits. + * + * @param val Token's value will be assigned to this address + * @param token Value which should be assigned + */ +template +bool ConvertToken(typename MatType::elem_type& val, const std::string& token); - template - bool LoadCSVV(MatType& x, std::fstream& f, std::string&); +/** + * Returns a bool value showing whether data was loaded successfully or not. + * + * Parses the file and loads the data into the given matrix. It will make the + * first parse to determine the number of cols and rows in the given file. + * Once the rows and cols are fixed we initialize a matrix of size(which we + * calculated in the first pass) and fill it with zeros. In the second pass + * it converts each value to required datatype and sets it equal to val. + * + * Using MatType as template parameter here so that in future if mlpack + * decides to use any other linear algebra library or want to support + * multiple linear algebra libraries, we can make the transition easily. + * This is to make the csv parser as generic as possible. 
+ * + * @param x Matrix in which data will be loaded + * @param f File stream to access the data file + */ +template +bool LoadCSVV(MatType& x, std::fstream& f); - } // namespace data +} // namespace data } // namespace mlpack // Include implementation diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 4d5a8eb7cdb..4c5b7636b24 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -1,4 +1,15 @@ -/* This file is originated from armadillo and adapted for mlpack +/** + * @file core/data/csv_parser_impl.hpp + * @author Gopi M. Tatiraju + * + * This csv parser is designed by taking reference from armadillo's csv parser. + * In this mlpack's version, all the arma dependencies were removed or replaced + * accordingly, making the parser totally independent of armadillo. + * + * This parser will be totally independent to any linear algebra library. + * This can be used to load data into any matrix, i.e. arma and bandicoot + * in future. + * * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) * Copyright 2008-2016 National ICT Australia (NICTA) @@ -14,7 +25,12 @@ * See the License for the specific language governing permissions and * limitations under the License. * ------------------------------------------------------------------------ -*/ + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. 
+ */ #ifndef MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP #define MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP @@ -38,16 +54,11 @@ namespace data if (N == 0) { val = typename MatType::elem_type(0); - return true; } const char* str = token.c_str(); - // checking for nan, +inf, -inf - // in both upper and lower case - // using arma::Datum which basically - // contains all the physical constants if ((N == 3) || (N == 4)) { const bool neg = (str[0] == '-'); @@ -63,10 +74,8 @@ namespace data ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F'))) { - // val = if(neg == true) ? -INF : +INF val = neg ? -(std::numeric_limits::infinity()) : std::numeric_limits::infinity(); - return true; } else if (((sig_a == 'n') || (sig_a == 'N')) && @@ -74,7 +83,6 @@ namespace data ((sig_c == 'n') || (sig_c == 'N'))) { val = std::numeric_limits::quiet_NaN(); - return true; } } @@ -108,16 +116,11 @@ namespace data { return false; } - return true; } - /** - * Loads the data from the csv file - * into the give MatType - */ template - bool LoadCSVV(MatType& x, std::fstream& f, std::string&) + bool LoadCSVV(MatType& x, std::fstream& f) { bool load_okay = f.good(); @@ -148,7 +151,6 @@ namespace data while (line_stream.good()) { - // reading each element of the row std::getline(line_stream, token, ','); ++line_n_cols; } @@ -192,7 +194,6 @@ namespace data } ++row; } - return load_okay; } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 0845c9d44b4..32259315b7b 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -164,12 +164,11 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. 
bool success; - std::string print_status; if (loadType != arma::hdf5_binary) { if(loadType == arma::csv_ascii) - success = LoadCSVV(matrix, stream, print_status); + success = LoadCSVV(matrix, stream); else success = matrix.load(stream, loadType); } From 9a52c63098bd86eec39a12f3ba54946ae42f6de6 Mon Sep 17 00:00:00 2001 From: Gopi M Tatiraju Date: Wed, 16 Jun 2021 16:04:42 +0530 Subject: [PATCH 018/112] Update csv_parser.hpp style fix --- src/mlpack/core/data/csv_parser.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index b5d01ee2e6e..0f7337ef3f6 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -42,16 +42,16 @@ enum struct file_type : unsigned int { file_type_unknown, auto_detect, //!< attempt to automatically detect the file type - raw_ascii, //!< raw text (ASCII), without a header - arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size - csv_ascii, //!< comma separated values (CSV), without a header - raw_binary, //!< raw binary format (machine dependent), without a header + raw_ascii, //!< raw text (ASCII), without a header + arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size + csv_ascii, //!< comma separated values (CSV), without a header + raw_binary, //!< raw binary format (machine dependent), without a header arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size - pgm_binary, //!< Portable Grey Map (greyscale image) - ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes + pgm_binary, //!< Portable Grey Map (greyscale image) + ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data hdf5_binary_trans, //!< [DO NOT US 
deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows - coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) + coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) }; /** From 4dc7fec2b9a02f8e332929c09db063674b15a3af Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 16 Jun 2021 21:02:34 +0530 Subject: [PATCH 019/112] added mlpack file type --- src/mlpack/core/data/csv_parser.hpp | 37 ++++++++++++++-------- src/mlpack/core/data/csv_parser_impl.hpp | 12 +++----- src/mlpack/core/data/load.cpp | 39 +++++++++++++++++++++++- src/mlpack/core/data/load.hpp | 17 ++++++----- src/mlpack/core/data/load_impl.hpp | 17 ++++++----- 5 files changed, 85 insertions(+), 37 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 0f7337ef3f6..d5d3e4ba799 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -1,5 +1,6 @@ /** * @file core/data/csv_parser.hpp + * @author Conrad Sanderson * @author Gopi M. Tatiraju * * This csv parser is designed by taking reference from armadillo's csv parser. 
@@ -40,20 +41,32 @@ namespace mlpack{ namespace data{ enum struct file_type : unsigned int { - file_type_unknown, - auto_detect, //!< attempt to automatically detect the file type - raw_ascii, //!< raw text (ASCII), without a header - arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size - csv_ascii, //!< comma separated values (CSV), without a header - raw_binary, //!< raw binary format (machine dependent), without a header - arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size - pgm_binary, //!< Portable Grey Map (greyscale image) - ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes - hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data - hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows - coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) + mlp_file_type_unknown, + mlp_auto_detect, //!< attempt to automatically detect the file type + mlp_raw_ascii, //!< raw text (ASCII), without a header + mlp_arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size + mlp_csv_ascii, //!< comma separated values (CSV), without a header + mlp_raw_binary, //!< raw binary format (machine dependent), without a header + mlp_arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + mlp_pgm_binary, //!< Portable Grey Map (greyscale image) + mlp_ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes + mlp_hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data + mlp_hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows + mlp_coord_ascii //!< simple co-ordinate format for sparse matrices 
(indices start at zero) }; +static constexpr file_type mlp_file_type_unknown = file_type::mlp_file_type_unknown; +static constexpr file_type mlp_auto_detect = file_type::mlp_auto_detect; +static constexpr file_type mlp_raw_ascii = file_type::mlp_raw_ascii; +static constexpr file_type mlp_arma_ascii = file_type::mlp_arma_ascii; +static constexpr file_type mlp_csv_ascii = file_type::mlp_csv_ascii; +static constexpr file_type mlp_raw_binary = file_type::mlp_raw_binary; +static constexpr file_type mlp_arma_binary = file_type::mlp_arma_binary; +static constexpr file_type mlp_pgm_binary = file_type::mlp_pgm_binary; +static constexpr file_type mlp_ppm_binary = file_type::mlp_ppm_binary; +static constexpr file_type mlp_hdf5_binary = file_type::mlp_hdf5_binary; +static constexpr file_type mlp_hdf5_binary_trans = file_type::mlp_hdf5_binary_trans; +static constexpr file_type mlp_coord_ascii = file_type::mlp_coord_ascii; /** * Convert the given string token to assigned datatype and assign * this value to the given address. The address here will be a diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 4c5b7636b24..545c7484fa1 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -1,5 +1,6 @@ /** * @file core/data/csv_parser_impl.hpp + * @author Conrad Sanderson * @author Gopi M. Tatiraju * * This csv parser is designed by taking reference from armadillo's csv parser. 
@@ -40,7 +41,6 @@ namespace mlpack { namespace data { - /** * Given the address of a martix element(val) * sets it equal to the provided value(token) @@ -125,25 +125,23 @@ namespace data bool load_okay = f.good(); f.clear(); + const std::fstream::pos_type pos1 = f.tellg(); - + size_t f_n_rows = 0; size_t f_n_cols = 0; - + std::string line_string; std::stringstream line_stream; - std::string token; while (f.good() && load_okay) { std::getline(f, line_string); - if (line_string.size() == 0) { break; } - line_stream.clear(); line_stream.str(line_string); @@ -187,9 +185,7 @@ namespace data while (line_stream.good()) { std::getline(line_stream, token, ','); - ConvertToken(x.at(row, col), token); - ++col; } ++row; diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp index 71ad35cb643..e0290ca10e2 100644 --- a/src/mlpack/core/data/load.cpp +++ b/src/mlpack/core/data/load.cpp @@ -14,7 +14,7 @@ namespace mlpack { namespace data /** Functions to load and save matrices and models. 
*/ { - +/* template bool Load(const std::string&, arma::Mat&, const bool, @@ -50,7 +50,44 @@ template bool Load(const std::string&, const bool, const bool, const arma::file_type); +*/ +//--------------------------------------------------------- +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); + +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); + +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); + +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); + +template bool Load(const std::string&, + arma::Mat&, + const bool, + const bool, + const file_type); +// -------------------------------------------------------------------- template bool Load(const std::string&, arma::SpMat&, const bool, diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 9b54f43ce13..063431843e6 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -21,6 +21,7 @@ #include "format.hpp" #include "dataset_mapper.hpp" #include "image_info.hpp" +#include "csv_parser.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { @@ -71,7 +72,7 @@ bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal = false, const bool transpose = true, - const arma::file_type inputLoadType = arma::auto_detect); + const file_type inputLoadType = file_type::mlp_auto_detect); /** * Loads a sparse matrix from file, using arma::coord_ascii format. This @@ -119,44 +120,44 @@ extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); // size_t and uword should be one of these three typedefs. 
extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::Mat&, const bool, const bool, - const arma::file_type); + const file_type); extern template bool Load(const std::string&, arma::SpMat&, diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 32259315b7b..93a4f202df1 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -22,6 +22,7 @@ #include "load.hpp" #include "extension.hpp" #include "detect_file_type.hpp" +#include "csv_parser.hpp" #include #include @@ -90,7 +91,7 @@ bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal, const bool transpose, - const arma::file_type inputLoadType) + const file_type inputLoadType) { Timer::Start("loading_data"); @@ -113,14 +114,14 @@ bool Load(const std::string& filename, return false; } - arma::file_type loadType = inputLoadType; + file_type loadType = inputLoadType; std::string stringType; - if (inputLoadType == arma::auto_detect) + if (inputLoadType == file_type::mlp_auto_detect) { // Attempt to auto-detect the type from the given file. loadType = AutoDetect(stream, filename); // Provide error if we don't know the type. 
- if (loadType == arma::file_type_unknown) + if (loadType == file_type::mlp_file_type_unknown) { Timer::Stop("loading_data"); if (fatal) @@ -137,7 +138,7 @@ bool Load(const std::string& filename, stringType = GetStringType(loadType); #ifndef ARMA_USE_HDF5 - if (inputLoadType == arma::hdf5_binary) + if (inputLoadType == file_type::mlp_hdf5_binary) { // Ensure that HDF5 is supported. Timer::Stop("loading_data"); @@ -155,7 +156,7 @@ bool Load(const std::string& filename, #endif // Try to load the file; but if it's raw_binary, it could be a problem. - if (loadType == arma::raw_binary) + if (loadType == file_type::mlp_raw_binary) Log::Warn << "Loading '" << filename << "' as " << stringType << "; " << "but this may not be the actual filetype!" << std::endl; else @@ -165,9 +166,9 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. bool success; - if (loadType != arma::hdf5_binary) + if (loadType != file_type::mlp_hdf5_binary) { - if(loadType == arma::csv_ascii) + if(loadType == file_type::mlp_csv_ascii) success = LoadCSVV(matrix, stream); else success = matrix.load(stream, loadType); From cbdab31eefee57e4ba8bcaa69d4e31e2c2d6e753 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 16 Jun 2021 21:32:15 +0530 Subject: [PATCH 020/112] adding mlpack file type in detect_file_type --- src/mlpack/core/data/detect_file_type.cpp | 65 +++++++++++++---------- src/mlpack/core/data/detect_file_type.hpp | 13 ++--- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index c00954f1c07..f2c8a497c15 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -29,6 +29,17 @@ namespace data { */ std::string GetStringType(const arma::file_type& type) { + switch (type) + { + case file_type::mlp_csv_ascii: return "CSV data"; + case file_type::mlp_raw_ascii: return "raw ASCII formatted data"; + case 
file_type::mlp_raw_binary: return "raw binary formatted data"; + case file_type::mlp_arma_ascii: return "Armadillo ASCII formatted data"; + case file_type::mlp_arma_binary: return "Armadillo binary formatted data"; + case file_type::mlp_pgm_binary: return "PGM data"; + case file_type::mlp_hdf5_binary: return "HDF5 data"; + default: return ""; + } switch (type) { case arma::csv_ascii: return "CSV data"; @@ -53,7 +64,7 @@ std::string GetStringType(const arma::file_type& type) * * @param f Opened istream to look into to guess the file type. */ -arma::file_type GuessFileType(std::istream& f) +file_type GuessFileType(std::istream& f) { f.clear(); const std::fstream::pos_type pos1 = f.tellg(); @@ -74,7 +85,7 @@ arma::file_type GuessFileType(std::istream& f) // Handle empty files. if (nMax == 0) - return arma::file_type_unknown; + return file_type::mlp_file_type_unknown; const arma::uword nUse = std::min(nMax, arma::uword(4096)); @@ -92,7 +103,7 @@ arma::file_type GuessFileType(std::istream& f) if (!loadOkay) { delete[] dataMem; - return arma::file_type_unknown; + return file_type::mlp_file_type_unknown; } bool hasBinary = false; @@ -168,12 +179,12 @@ arma::file_type GuessFileType(std::istream& f) delete[] dataMem; if (hasBinary) - return arma::raw_binary; + return file_type::mlp_raw_binary; if (hasComma && (hasBracket == false)) - return arma::csv_ascii; + return file_type::mlp_csv_ascii; - return arma::raw_ascii; + return file_type::mlp_raw_ascii; } /** @@ -189,22 +200,22 @@ arma::file_type GuessFileType(std::istream& f) * @param filename Name of the file. * @return The detected file type. */ -arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) +file_type AutoDetect(std::fstream& stream, const std::string& filename) { // Get the extension. 
std::string extension = Extension(filename); - arma::file_type detectedLoadType = arma::file_type_unknown; + file_type detectedLoadType = file_type::mlp_file_type_unknown; if (extension == "csv" || extension == "tsv") { detectedLoadType = GuessFileType(stream); - if (detectedLoadType == arma::csv_ascii) + if (detectedLoadType == file_type::mlp_csv_ascii) { if (extension == "tsv") Log::Warn << "'" << filename << "' is comma-separated, not " "tab-separated!" << std::endl; } - else if (detectedLoadType == arma::raw_ascii) // .csv file can be tsv. + else if (detectedLoadType == file_type::mlp_raw_ascii) // .csv file can be tsv. { if (extension == "csv") { @@ -231,7 +242,7 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) } else { - detectedLoadType = arma::file_type_unknown; + detectedLoadType = file_type::mlp_file_type_unknown; } } else if (extension == "txt") @@ -251,15 +262,15 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_TXT) { - detectedLoadType = arma::arma_ascii; + detectedLoadType = file_type::mlp_arma_ascii; } else // It's not arma_ascii. Now we let Armadillo guess. { detectedLoadType = GuessFileType(stream); - if (detectedLoadType != arma::raw_ascii && - detectedLoadType != arma::csv_ascii) - detectedLoadType = arma::file_type_unknown; + if (detectedLoadType != file_type::mlp_raw_ascii && + detectedLoadType != file_type::mlp_csv_ascii) + detectedLoadType = file_type::mlp_file_type_unknown; } } else if (extension == "bin") @@ -277,25 +288,25 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_BIN) { - detectedLoadType = arma::arma_binary; + detectedLoadType = file_type::mlp_arma_binary; } else // We can only assume it's raw binary. 
{ - detectedLoadType = arma::raw_binary; + detectedLoadType = file_type::mlp_raw_binary; } } else if (extension == "pgm") { - detectedLoadType = arma::pgm_binary; + detectedLoadType = file_type::mlp_pgm_binary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - detectedLoadType = arma::hdf5_binary; + detectedLoadType = file_type::mlp_hdf5_binary; } else // Unknown extension... { - detectedLoadType = arma::file_type_unknown; + detectedLoadType = file_type::mlp_file_type_unknown; } return detectedLoadType; @@ -307,34 +318,34 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) * @param filename Name of the file whose type we should detect. * @return Detected type of file. */ -arma::file_type DetectFromExtension(const std::string& filename) +file_type DetectFromExtension(const std::string& filename) { const std::string extension = Extension(filename); if (extension == "csv") { - return arma::csv_ascii; + return file_type::mlp_csv_ascii; } else if (extension == "txt") { - return arma::raw_ascii; + return file_type::mlp_raw_ascii; } else if (extension == "bin") { - return arma::arma_binary; + return file_type::mlp_arma_binary; } else if (extension == "pgm") { - return arma::pgm_binary; + return file_type::mlp_pgm_binary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - return arma::hdf5_binary; + return file_type::mlp_hdf5_binary; } else { - return arma::file_type_unknown; + return file_type::mlp_file_type_unknown; } } diff --git a/src/mlpack/core/data/detect_file_type.hpp b/src/mlpack/core/data/detect_file_type.hpp index 8856de29fee..6cf2ca124d3 100644 --- a/src/mlpack/core/data/detect_file_type.hpp +++ b/src/mlpack/core/data/detect_file_type.hpp @@ -15,6 +15,8 @@ #ifndef MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP #define MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP +#include "csv_parser.hpp" + namespace mlpack { namespace data { @@ -23,8 +25,7 @@ 
namespace data { * * @param type Type to get the logical name of. */ -std::string GetStringType(const arma::file_type& type); - +std::string GetStringType(const file_type& type); /** * Given an istream, attempt to guess the file type. This is taken originally * from Armadillo's function guess_file_type_internal(), but we avoid using @@ -36,7 +37,7 @@ std::string GetStringType(const arma::file_type& type); * * @param f Opened istream to look into to guess the file type. */ -arma::file_type GuessFileType(std::istream& f); +file_type GuessFileType(std::istream& f); /** * Attempt to auto-detect the type of a file given its extension, and by @@ -51,8 +52,8 @@ arma::file_type GuessFileType(std::istream& f); * @param filename Name of the file. * @return The detected file type. arma::file_type_unknown if unknown. */ -arma::file_type AutoDetect(std::fstream& stream, - const std::string& filename); +file_type AutoDetect(std::fstream& stream, + const std::string& filename); /** * Return the type based only on the extension. @@ -60,7 +61,7 @@ arma::file_type AutoDetect(std::fstream& stream, * @param filename Name of the file whose type we should detect. * @return Detected type of file. arma::file_type_unknown if unknown. 
*/ -arma::file_type DetectFromExtension(const std::string& filename); +file_type DetectFromExtension(const std::string& filename); } // namespace data } // namespace mlpack From 739e212654431ed185ce263220e310ed5479a359 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 17 Jun 2021 02:56:34 +0530 Subject: [PATCH 021/112] Replacing arma file type with mlpack file type --- src/mlpack/core/data/csv_parser.hpp | 25 ++++------ src/mlpack/core/data/csv_parser_impl.hpp | 58 +++++++++++++++++++++++ src/mlpack/core/data/detect_file_type.cpp | 13 +---- src/mlpack/core/data/load_impl.hpp | 4 +- src/mlpack/core/data/save.hpp | 3 +- src/mlpack/core/data/save_impl.hpp | 38 +++++++-------- 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index d5d3e4ba799..767d45ce5d5 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -39,6 +39,7 @@ namespace mlpack{ namespace data{ + enum struct file_type : unsigned int { mlp_file_type_unknown, @@ -55,18 +56,15 @@ enum struct file_type : unsigned int mlp_coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) }; -static constexpr file_type mlp_file_type_unknown = file_type::mlp_file_type_unknown; -static constexpr file_type mlp_auto_detect = file_type::mlp_auto_detect; -static constexpr file_type mlp_raw_ascii = file_type::mlp_raw_ascii; -static constexpr file_type mlp_arma_ascii = file_type::mlp_arma_ascii; -static constexpr file_type mlp_csv_ascii = file_type::mlp_csv_ascii; -static constexpr file_type mlp_raw_binary = file_type::mlp_raw_binary; -static constexpr file_type mlp_arma_binary = file_type::mlp_arma_binary; -static constexpr file_type mlp_pgm_binary = file_type::mlp_pgm_binary; -static constexpr file_type mlp_ppm_binary = file_type::mlp_ppm_binary; -static constexpr file_type mlp_hdf5_binary = file_type::mlp_hdf5_binary; -static constexpr file_type mlp_hdf5_binary_trans = 
file_type::mlp_hdf5_binary_trans; -static constexpr file_type mlp_coord_ascii = file_type::mlp_coord_ascii; +/** + * WHere should I place this fucntion? + * This fucntion is used to convert mlpack file type to respective + * arma file type. + * + * @param type Mlpack's file_type which will we converted to arma's file_type + */ +arma::file_type ToArmaFileType(file_type& type); + /** * Convert the given string token to assigned datatype and assign * this value to the given address. The address here will be a @@ -104,7 +102,4 @@ bool LoadCSVV(MatType& x, std::fstream& f); } // namespace data } // namespace mlpack -// Include implementation -#include "csv_parser_impl.hpp" - #endif diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 545c7484fa1..eeede66c10f 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -41,6 +41,64 @@ namespace mlpack { namespace data { + arma::file_type ToArmaFileType(file_type& type) + { + switch(type) + { + case file_type::mlp_file_type_unknown: + return arma::file_type_unknown; + break; + + case file_type::mlp_auto_detect: + return arma::auto_detect; + break; + + case file_type::mlp_raw_ascii: + return arma::raw_ascii; + break; + + case file_type::mlp_arma_ascii: + return arma::arma_ascii; + break; + + case file_type::mlp_csv_ascii: + return arma::csv_ascii; + break; + + case file_type::mlp_raw_binary: + return arma::raw_binary; + break; + + case file_type::mlp_arma_binary: + return arma::arma_binary; + break; + + case file_type::mlp_pgm_binary: + return arma::pgm_binary; + break; + + case file_type::mlp_ppm_binary: + return arma::ppm_binary; + break; + + case file_type::mlp_hdf5_binary: + return arma::hdf5_binary; + break; + + case file_type::mlp_hdf5_binary_trans: + return arma::hdf5_binary_trans; + break; + + case file_type::mlp_coord_ascii: + return arma::coord_ascii; + break; + + default: + return arma::file_type_unknown; + break; + } + 
} + /** * Given the address of a martix element(val) * sets it equal to the provided value(token) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index f2c8a497c15..6e8dd01a671 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -27,7 +27,7 @@ namespace data { * * @param type Type to get the logical name of. */ -std::string GetStringType(const arma::file_type& type) +std::string GetStringType(const file_type& type) { switch (type) { @@ -40,17 +40,6 @@ std::string GetStringType(const arma::file_type& type) case file_type::mlp_hdf5_binary: return "HDF5 data"; default: return ""; } - switch (type) - { - case arma::csv_ascii: return "CSV data"; - case arma::raw_ascii: return "raw ASCII formatted data"; - case arma::raw_binary: return "raw binary formatted data"; - case arma::arma_ascii: return "Armadillo ASCII formatted data"; - case arma::arma_binary: return "Armadillo binary formatted data"; - case arma::pgm_binary: return "PGM data"; - case arma::hdf5_binary: return "HDF5 data"; - default: return ""; - } } /** diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 93a4f202df1..530b669e434 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -171,12 +171,12 @@ bool Load(const std::string& filename, if(loadType == file_type::mlp_csv_ascii) success = LoadCSVV(matrix, stream); else - success = matrix.load(stream, loadType); + success = matrix.load(stream, ToArmaFileType(loadType)); } // success = LoadData(filename, matrix, file_type::csv_ascii); // success = matrix.load(stream, loadType); else - success = matrix.load(filename, loadType); + success = matrix.load(filename, ToArmaFileType(loadType)); if (!success) { diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 6f8889cf8e2..016a4e92f82 100644 --- a/src/mlpack/core/data/save.hpp +++ 
b/src/mlpack/core/data/save.hpp @@ -20,6 +20,7 @@ #include "format.hpp" #include "image_info.hpp" +#include "csv_parser.hpp" namespace mlpack { namespace data /** Functions to load and save matrices. */ { @@ -64,7 +65,7 @@ bool Save(const std::string& filename, const arma::Mat& matrix, const bool fatal = false, bool transpose = true, - arma::file_type inputSaveType = arma::auto_detect); + file_type inputSaveType = file_type::mlp_auto_detect); /** * Saves a sparse matrix to file, guessing the filetype from the diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index c51e666b1f9..b9a4668e4aa 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -28,7 +28,7 @@ template bool Save(const std::string& filename, const arma::Col& vec, const bool fatal, - arma::file_type inputSaveType) + file_type inputSaveType) { // Don't transpose: one observation per line (for CSVs at least). return Save(filename, vec, fatal, false, inputSaveType); @@ -38,7 +38,7 @@ template bool Save(const std::string& filename, const arma::Row& rowvec, const bool fatal, - arma::file_type inputSaveType) + file_type inputSaveType) { return Save(filename, rowvec, fatal, true, inputSaveType); } @@ -48,18 +48,18 @@ bool Save(const std::string& filename, const arma::Mat& matrix, const bool fatal, bool transpose, - arma::file_type inputSaveType) + file_type inputSaveType) { Timer::Start("saving_data"); - arma::file_type saveType = inputSaveType; + file_type saveType = inputSaveType; std::string stringType = ""; - if (inputSaveType == arma::auto_detect) + if (inputSaveType == file_type::mlp_auto_detect) { // Detect the file type using only the extension. 
saveType = DetectFromExtension(filename); - if (saveType == arma::file_type_unknown) + if (saveType == file_type::mlp_file_type_unknown) { if (fatal) Log::Fatal << "Could not detect type of file '" << filename << "' for " @@ -105,11 +105,11 @@ bool Save(const std::string& filename, #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == arma::hdf5_binary) ? - tmp.quiet_save(filename, saveType) : - tmp.quiet_save(stream, saveType); + const bool success = (saveType == file_type::mlp_hdf5_binary) ? + tmp.quiet_save(filename, ToArmaFileType(saveType)) : + tmp.quiet_save(stream, ToArmaFileType(saveType)); #else - const bool success = tmp.quiet_save(stream, saveType); + const bool success = tmp.quiet_save(stream, ToArmaFileType(saveType)); #endif if (!success) { @@ -126,11 +126,11 @@ bool Save(const std::string& filename, { #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == arma::hdf5_binary) ? - matrix.quiet_save(filename, saveType) : - matrix.quiet_save(stream, saveType); + const bool success = (saveType == file_type::mlp_hdf5_binary) ? + matrix.quiet_save(filename, ToArmaFileType(saveType)) : + matrix.quiet_save(stream, ToArmaFileType(saveType)); #else - const bool success = matrix.quiet_save(stream, saveType); + const bool success = matrix.quiet_save(stream, ToArmaFileType(saveType)); #endif if (!success) { @@ -195,23 +195,23 @@ bool Save(const std::string& filename, } bool unknownType = false; - arma::file_type saveType; + file_type saveType; std::string stringType; if (extension == "txt" || extension == "tsv") { - saveType = arma::coord_ascii; + saveType = file_type::mlp_coord_ascii; stringType = "raw ASCII formatted data"; } else if (extension == "bin") { - saveType = arma::arma_binary; + saveType = file_type::mlp_arma_binary; stringType = "Armadillo binary formatted data"; } else { unknownType = true; - saveType = arma::raw_binary; // Won't be used; prevent a warning. 
+ saveType = file_type::mlp_raw_binary; // Won't be used; prevent a warning. stringType = ""; } @@ -241,7 +241,7 @@ bool Save(const std::string& filename, tmp = trans(matrix); } - const bool success = tmp.quiet_save(stream, saveType); + const bool success = tmp.quiet_save(stream, ToArmaFileType(saveType)); if (!success) { Timer::Stop("saving_data"); From 5f4cbba7636086062eebc017b36c3760fcd54770 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 20 Jun 2021 20:17:47 +0530 Subject: [PATCH 022/112] Created new file named type.hpp for mlpack file types and utility fucntions realted to file types. Changed the naming converntion of file type to mlpack's style. Changed fucntion name LoadCSVV -> LoadCSVFile --- src/mlpack/core/data/CMakeLists.txt | 2 + src/mlpack/core/data/csv_parser.hpp | 31 +------ src/mlpack/core/data/csv_parser_impl.hpp | 64 ++------------ src/mlpack/core/data/detect_file_type.cpp | 62 ++++++------- src/mlpack/core/data/load.hpp | 2 +- src/mlpack/core/data/load_impl.hpp | 13 +-- src/mlpack/core/data/save.hpp | 3 +- src/mlpack/core/data/save_impl.hpp | 14 +-- src/mlpack/core/data/types.hpp | 76 ++++++++++++++++ src/mlpack/core/data/types_impl.hpp | 102 ++++++++++++++++++++++ 10 files changed, 237 insertions(+), 132 deletions(-) create mode 100644 src/mlpack/core/data/types.hpp create mode 100644 src/mlpack/core/data/types_impl.hpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 746e4cd0209..106e37320ea 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -36,6 +36,8 @@ set(SOURCES one_hot_encoding_impl.hpp csv_parser.hpp csv_parser_impl.hpp + types.hpp + types_impl.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 767d45ce5d5..8b87451cbdb 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -35,36 +35,11 @@ #ifndef MLPACK_CORE_DATA_CSV_PARSER_HPP 
#define MLPACK_CORE_DATA_CSV_PARSER_HPP -#include +#include "types.hpp" namespace mlpack{ namespace data{ -enum struct file_type : unsigned int -{ - mlp_file_type_unknown, - mlp_auto_detect, //!< attempt to automatically detect the file type - mlp_raw_ascii, //!< raw text (ASCII), without a header - mlp_arma_ascii, //!< Armadillo text format, with a header specifying matrix type and size - mlp_csv_ascii, //!< comma separated values (CSV), without a header - mlp_raw_binary, //!< raw binary format (machine dependent), without a header - mlp_arma_binary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size - mlp_pgm_binary, //!< Portable Grey Map (greyscale image) - mlp_ppm_binary, //!< Portable Pixel Map (colour image), used by the field and cube classes - mlp_hdf5_binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data - mlp_hdf5_binary_trans, //!< [DO NOT US deprecated] as per hdf5_binary, but save/load the data with columns transposed to rows - mlp_coord_ascii //!< simple co-ordinate format for sparse matrices (indices start at zero) - }; - -/** - * WHere should I place this fucntion? - * This fucntion is used to convert mlpack file type to respective - * arma file type. - * - * @param type Mlpack's file_type which will we converted to arma's file_type - */ -arma::file_type ToArmaFileType(file_type& type); - /** * Convert the given string token to assigned datatype and assign * this value to the given address. 
The address here will be a @@ -97,9 +72,11 @@ bool ConvertToken(typename MatType::elem_type& val, const std::string& token); * @param f File stream to access the data file */ template -bool LoadCSVV(MatType& x, std::fstream& f); +bool LoadCSVFile(MatType& x, std::fstream& f); } // namespace data } // namespace mlpack +#include "csv_parser_impl.hpp" + #endif diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index eeede66c10f..c147533f18f 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -41,64 +41,6 @@ namespace mlpack { namespace data { - arma::file_type ToArmaFileType(file_type& type) - { - switch(type) - { - case file_type::mlp_file_type_unknown: - return arma::file_type_unknown; - break; - - case file_type::mlp_auto_detect: - return arma::auto_detect; - break; - - case file_type::mlp_raw_ascii: - return arma::raw_ascii; - break; - - case file_type::mlp_arma_ascii: - return arma::arma_ascii; - break; - - case file_type::mlp_csv_ascii: - return arma::csv_ascii; - break; - - case file_type::mlp_raw_binary: - return arma::raw_binary; - break; - - case file_type::mlp_arma_binary: - return arma::arma_binary; - break; - - case file_type::mlp_pgm_binary: - return arma::pgm_binary; - break; - - case file_type::mlp_ppm_binary: - return arma::ppm_binary; - break; - - case file_type::mlp_hdf5_binary: - return arma::hdf5_binary; - break; - - case file_type::mlp_hdf5_binary_trans: - return arma::hdf5_binary_trans; - break; - - case file_type::mlp_coord_ascii: - return arma::coord_ascii; - break; - - default: - return arma::file_type_unknown; - break; - } - } - /** * Given the address of a martix element(val) * sets it equal to the provided value(token) @@ -177,8 +119,12 @@ namespace data return true; } + /** + * Returns a bool value showing whether data was loaded successfully or not. + * Parses the file and loads the data into the given matrix. 
+ */ template - bool LoadCSVV(MatType& x, std::fstream& f) + bool LoadCSVFile(MatType& x, std::fstream& f) { bool load_okay = f.good(); diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 6e8dd01a671..129f29b7130 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -31,13 +31,13 @@ std::string GetStringType(const file_type& type) { switch (type) { - case file_type::mlp_csv_ascii: return "CSV data"; - case file_type::mlp_raw_ascii: return "raw ASCII formatted data"; - case file_type::mlp_raw_binary: return "raw binary formatted data"; - case file_type::mlp_arma_ascii: return "Armadillo ASCII formatted data"; - case file_type::mlp_arma_binary: return "Armadillo binary formatted data"; - case file_type::mlp_pgm_binary: return "PGM data"; - case file_type::mlp_hdf5_binary: return "HDF5 data"; + case file_type::CSVASCII: return "CSV data"; + case file_type::RawASCII: return "raw ASCII formatted data"; + case file_type::RawBinary: return "raw binary formatted data"; + case file_type::ArmaASCII: return "Armadillo ASCII formatted data"; + case file_type::ArmaBinary: return "Armadillo binary formatted data"; + case file_type::PGMBinary: return "PGM data"; + case file_type::HDF5Binary: return "HDF5 data"; default: return ""; } } @@ -74,7 +74,7 @@ file_type GuessFileType(std::istream& f) // Handle empty files. 
if (nMax == 0) - return file_type::mlp_file_type_unknown; + return file_type::FileTypeUnknown; const arma::uword nUse = std::min(nMax, arma::uword(4096)); @@ -92,7 +92,7 @@ file_type GuessFileType(std::istream& f) if (!loadOkay) { delete[] dataMem; - return file_type::mlp_file_type_unknown; + return file_type::FileTypeUnknown; } bool hasBinary = false; @@ -168,12 +168,12 @@ file_type GuessFileType(std::istream& f) delete[] dataMem; if (hasBinary) - return file_type::mlp_raw_binary; + return file_type::RawBinary; if (hasComma && (hasBracket == false)) - return file_type::mlp_csv_ascii; + return file_type::CSVASCII; - return file_type::mlp_raw_ascii; + return file_type::RawASCII; } /** @@ -193,18 +193,18 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) { // Get the extension. std::string extension = Extension(filename); - file_type detectedLoadType = file_type::mlp_file_type_unknown; + file_type detectedLoadType = file_type::FileTypeUnknown; if (extension == "csv" || extension == "tsv") { detectedLoadType = GuessFileType(stream); - if (detectedLoadType == file_type::mlp_csv_ascii) + if (detectedLoadType == file_type::CSVASCII) { if (extension == "tsv") Log::Warn << "'" << filename << "' is comma-separated, not " "tab-separated!" << std::endl; } - else if (detectedLoadType == file_type::mlp_raw_ascii) // .csv file can be tsv. + else if (detectedLoadType == file_type::RawASCII) // .csv file can be tsv. { if (extension == "csv") { @@ -231,7 +231,7 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) } else { - detectedLoadType = file_type::mlp_file_type_unknown; + detectedLoadType = file_type::FileTypeUnknown; } } else if (extension == "txt") @@ -251,15 +251,15 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_TXT) { - detectedLoadType = file_type::mlp_arma_ascii; + detectedLoadType = file_type::ArmaASCII; } else // It's not arma_ascii. Now we let Armadillo guess. 
{ detectedLoadType = GuessFileType(stream); - if (detectedLoadType != file_type::mlp_raw_ascii && - detectedLoadType != file_type::mlp_csv_ascii) - detectedLoadType = file_type::mlp_file_type_unknown; + if (detectedLoadType != file_type::RawASCII && + detectedLoadType != file_type::CSVASCII) + detectedLoadType = file_type::FileTypeUnknown; } } else if (extension == "bin") @@ -277,25 +277,25 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_BIN) { - detectedLoadType = file_type::mlp_arma_binary; + detectedLoadType = file_type::ArmaBinary; } else // We can only assume it's raw binary. { - detectedLoadType = file_type::mlp_raw_binary; + detectedLoadType = file_type::RawBinary; } } else if (extension == "pgm") { - detectedLoadType = file_type::mlp_pgm_binary; + detectedLoadType = file_type::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - detectedLoadType = file_type::mlp_hdf5_binary; + detectedLoadType = file_type::HDF5Binary; } else // Unknown extension... 
{ - detectedLoadType = file_type::mlp_file_type_unknown; + detectedLoadType = file_type::FileTypeUnknown; } return detectedLoadType; @@ -313,28 +313,28 @@ file_type DetectFromExtension(const std::string& filename) if (extension == "csv") { - return file_type::mlp_csv_ascii; + return file_type::CSVASCII; } else if (extension == "txt") { - return file_type::mlp_raw_ascii; + return file_type::RawASCII; } else if (extension == "bin") { - return file_type::mlp_arma_binary; + return file_type::ArmaBinary; } else if (extension == "pgm") { - return file_type::mlp_pgm_binary; + return file_type::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - return file_type::mlp_hdf5_binary; + return file_type::HDF5Binary; } else { - return file_type::mlp_file_type_unknown; + return file_type::FileTypeUnknown; } } diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 063431843e6..e43f912e3ef 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -72,7 +72,7 @@ bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal = false, const bool transpose = true, - const file_type inputLoadType = file_type::mlp_auto_detect); + const file_type inputLoadType = file_type::AutoDetect); /** * Loads a sparse matrix from file, using arma::coord_ascii format. 
This diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 530b669e434..e166317a179 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -23,6 +23,7 @@ #include "extension.hpp" #include "detect_file_type.hpp" #include "csv_parser.hpp" +#include "types.hpp" #include #include @@ -116,12 +117,12 @@ bool Load(const std::string& filename, file_type loadType = inputLoadType; std::string stringType; - if (inputLoadType == file_type::mlp_auto_detect) + if (inputLoadType == file_type::AutoDetect) { // Attempt to auto-detect the type from the given file. loadType = AutoDetect(stream, filename); // Provide error if we don't know the type. - if (loadType == file_type::mlp_file_type_unknown) + if (loadType == file_type::FileTypeUnknown) { Timer::Stop("loading_data"); if (fatal) @@ -138,7 +139,7 @@ bool Load(const std::string& filename, stringType = GetStringType(loadType); #ifndef ARMA_USE_HDF5 - if (inputLoadType == file_type::mlp_hdf5_binary) + if (inputLoadType == file_type::HDF5Binary) { // Ensure that HDF5 is supported. Timer::Stop("loading_data"); @@ -156,7 +157,7 @@ bool Load(const std::string& filename, #endif // Try to load the file; but if it's raw_binary, it could be a problem. - if (loadType == file_type::mlp_raw_binary) + if (loadType == file_type::RawBinary) Log::Warn << "Loading '" << filename << "' as " << stringType << "; " << "but this may not be the actual filetype!" << std::endl; else @@ -166,9 +167,9 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. 
bool success; - if (loadType != file_type::mlp_hdf5_binary) + if (loadType != file_type::HDF5Binary) { - if(loadType == file_type::mlp_csv_ascii) + if(loadType == file_type::CSVASCII) success = LoadCSVV(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 016a4e92f82..1e3f5a3486a 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -21,6 +21,7 @@ #include "format.hpp" #include "image_info.hpp" #include "csv_parser.hpp" +#include "types.hpp" namespace mlpack { namespace data /** Functions to load and save matrices. */ { @@ -65,7 +66,7 @@ bool Save(const std::string& filename, const arma::Mat& matrix, const bool fatal = false, bool transpose = true, - file_type inputSaveType = file_type::mlp_auto_detect); + file_type inputSaveType = file_type::AutoDetect); /** * Saves a sparse matrix to file, guessing the filetype from the diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index b9a4668e4aa..61bbd80942b 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -55,11 +55,11 @@ bool Save(const std::string& filename, file_type saveType = inputSaveType; std::string stringType = ""; - if (inputSaveType == file_type::mlp_auto_detect) + if (inputSaveType == file_type::AutoDetect) { // Detect the file type using only the extension. saveType = DetectFromExtension(filename); - if (saveType == file_type::mlp_file_type_unknown) + if (saveType == file_type::FileTypeUnknown) { if (fatal) Log::Fatal << "Could not detect type of file '" << filename << "' for " @@ -105,7 +105,7 @@ bool Save(const std::string& filename, #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == file_type::mlp_hdf5_binary) ? + const bool success = (saveType == file_type::HDF5Binary) ? 
tmp.quiet_save(filename, ToArmaFileType(saveType)) : tmp.quiet_save(stream, ToArmaFileType(saveType)); #else @@ -126,7 +126,7 @@ bool Save(const std::string& filename, { #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == file_type::mlp_hdf5_binary) ? + const bool success = (saveType == file_type::HDF5Binary) ? matrix.quiet_save(filename, ToArmaFileType(saveType)) : matrix.quiet_save(stream, ToArmaFileType(saveType)); #else @@ -200,18 +200,18 @@ bool Save(const std::string& filename, if (extension == "txt" || extension == "tsv") { - saveType = file_type::mlp_coord_ascii; + saveType = file_type::CoordASCII; stringType = "raw ASCII formatted data"; } else if (extension == "bin") { - saveType = file_type::mlp_arma_binary; + saveType = file_type::ArmaBinary; stringType = "Armadillo binary formatted data"; } else { unknownType = true; - saveType = file_type::mlp_raw_binary; // Won't be used; prevent a warning. + saveType = file_type::RawBinary; // Won't be used; prevent a warning. stringType = ""; } diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp new file mode 100644 index 00000000000..a59b2e3d050 --- /dev/null +++ b/src/mlpack/core/data/types.hpp @@ -0,0 +1,76 @@ +/** + * @file core/data/types.hpp + * @author Gopi M. Tatiraju + * + * This file contains utilitiy fucntions related to types of data. + * We have adapted all the standard types which are available in armadillo. + * + * This file also contains functions to convery external file types to mlpack + * file types. In future if we the user of mlpack needs support of an external + * linear algebra library like armadillo, all fucntions related to handling the + * types goes here. 
+ * + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. 
+ */ + +#ifndef MLPACK_CORE_DATA_TYPES_HPP +#define MLPACK_CORE_DATA_TYPES_HPP + +#include +#include + +namespace mlpack +{ +namespace data +{ + +enum struct file_type +{ + FileTypeUnknown, + AutoDetect, //!< attempt to automatically detect the file type + RawASCII, //!< raw text (ASCII), without a header + ArmaASCII, //!< Armadillo text format, with a header specifying matrix type and size + CSVASCII, //!< comma separated values (CSV), without a header + RawBinary, //!< raw binary format (machine dependent), without a header + ArmaBinary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + PGMBinary, //!< Portable Grey Map (greyscale image) + PPMBinary, //!< Portable Pixel Map (colour image), used by the field and cube classes + HDF5Binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data + CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) +}; + +/** + * Where should I place this function? + * This function is used to convert an mlpack file type to the respective + * arma file type. + * + * @param type mlpack's file_type which will be converted to arma's file_type + */ +inline arma::file_type ToArmaFileType(file_type& type); + +} // namespace data +} // namespace mlpack + +#include "types_impl.hpp" + +#endif + diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp new file mode 100644 index 00000000000..9df7b7d5c41 --- /dev/null +++ b/src/mlpack/core/data/types_impl.hpp @@ -0,0 +1,102 @@ +/** + * @file core/data/types_impl.hpp + * @author Gopi M. Tatiraju + * + * This file contains utility functions related to types of data. + * We have adapted all the standard types which are available in armadillo. + * + * This file also contains functions to convert mlpack file types to external + * file types.
In the future, if users of mlpack need support for an external + * linear algebra library like armadillo, all functions related to handling the + * types go here. + * + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */ + +#ifndef MLPACK_CORE_DATA_TYPES_IMPL_HPP +#define MLPACK_CORE_DATA_TYPES_IMPL_HPP + +#include "types.hpp" + +namespace mlpack +{ +namespace data +{ + +inline arma::file_type ToArmaFileType(file_type& type) +{ + switch(type) + { + case file_type::FileTypeUnknown: + return arma::file_type_unknown; + break; + + case file_type::AutoDetect: + return arma::auto_detect; + break; + + case file_type::RawASCII: + return arma::raw_ascii; + break; + + case file_type::ArmaASCII: + return arma::arma_ascii; + break; + + case file_type::CSVASCII: + return arma::csv_ascii; + break; + + case file_type::RawBinary: + return arma::raw_binary; + break; + + case file_type::ArmaBinary: + return arma::arma_binary; + break; + + case file_type::PGMBinary: + return arma::pgm_binary; + break; + + case file_type::PPMBinary: + return arma::ppm_binary; + break; + + case file_type::HDF5Binary: + return arma::hdf5_binary; + break; + + case file_type::CoordASCII: + return arma::coord_ascii; + break; + + default: + return arma::file_type_unknown; + break; + } +} + +} // namespace data +} // namespace mlpack + +#endif From 585dbd9d6f6d5e3ace99735dfa0d02b15d67056e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 20 Jun 2021 20:43:33 +0530 Subject: [PATCH 023/112] Minor Fix --- src/mlpack/core/data/csv_parser_impl.hpp | 2 +- src/mlpack/core/data/load_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index c147533f18f..1c010b121f6 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -168,7 +168,7 @@ namespace data f.clear(); f.seekg(pos1); - x.zeros(f_n_rows, f_n_cols); + x.set_size(f_n_rows, f_n_cols); size_t row = 0; diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index e166317a179..37b429eb0d4 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -170,7 
+170,7 @@ bool Load(const std::string& filename, if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = LoadCSVV(matrix, stream); + success = LoadCSVFile(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); } From c3907bb2485d2d5f3e5f8839bb385487341552e6 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 20 Jun 2021 21:01:39 +0530 Subject: [PATCH 024/112] Removed load.cpp file --- src/mlpack/core/data/CMakeLists.txt | 1 - src/mlpack/core/data/load.cpp | 161 ---------------------------- 2 files changed, 162 deletions(-) delete mode 100644 src/mlpack/core/data/load.cpp diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 106e37320ea..0f188703b65 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -17,7 +17,6 @@ set(SOURCES load_model_impl.hpp load_vec_impl.hpp load_impl.hpp - load.cpp load_arff.hpp load_arff_impl.hpp normalize_labels.hpp diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp deleted file mode 100644 index e0290ca10e2..00000000000 --- a/src/mlpack/core/data/load.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/** - * @file core/data/load.cpp - * @author Tham Ngap Wei - * - * Force instantiation of some Load() overloads to reduce compile time. - * - * mlpack is free software; you may redistribute it and/or modify it under the - * terms of the 3-clause BSD license. You should have received a copy of the - * 3-clause BSD license along with mlpack. If not, see - * http://www.opensource.org/licenses/BSD-3-Clause for more information. - */ -#include "load.hpp" -#include "load_impl.hpp" - -namespace mlpack { -namespace data /** Functions to load and save matrices and models. 
*/ { -/* -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); -*/ -//--------------------------------------------------------- -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); -// -------------------------------------------------------------------- -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - 
arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -} // namespace data -} // namespace mlpack From 6d6616f619348c59b1531785d454c016e699e74f Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 20 Jun 2021 22:40:56 +0530 Subject: [PATCH 025/112] Commenting declarations in load.hpp --- src/mlpack/core/data/load.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index e43f912e3ef..1e4fda0f240 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -115,7 +115,7 @@ bool Load(const std::string& filename, * * @cond */ - +/* extern template bool Load(const std::string&, arma::Mat&, const bool, @@ -184,6 +184,7 @@ extern template bool Load(const std::string&, const bool, const bool); +*/ /** * @endcond */ @@ -298,6 +299,7 @@ bool Load(const std::string& filename, * @cond */ +/* extern template bool Load( const std::string&, arma::Mat&, @@ -332,7 +334,7 @@ extern template bool Load( DatasetMapper&, const bool, const bool); - +*/ /** * @endcond */ From 68281ea20f01fd4b7761822e671663f500efd88b Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 22 Jun 2021 16:42:55 +0530 Subject: [PATCH 026/112] Minor changes --- src/mlpack/core/data/csv_parser.hpp | 20 ++--- src/mlpack/core/data/load.hpp | 131 +--------------------------- src/mlpack/core/data/load_impl.hpp | 6 +- src/mlpack/core/data/save.hpp | 4 +- src/mlpack/core/data/save_impl.hpp | 6 
+- src/mlpack/core/data/types.hpp | 16 ---- src/mlpack/core/data/types_impl.hpp | 16 ---- 7 files changed, 18 insertions(+), 181 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 8b87451cbdb..360f3845aea 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -57,17 +57,15 @@ bool ConvertToken(typename MatType::elem_type& val, const std::string& token); /** * Returns a bool value showing whether data was loaded successfully or not. * - * Parses the file and loads the data into the given matrix. It will make the - * first parse to determine the number of cols and rows in the given file. - * Once the rows and cols are fixed we initialize a matrix of size(which we - * calculated in the first pass) and fill it with zeros. In the second pass - * it converts each value to required datatype and sets it equal to val. - * - * Using MatType as template parameter here so that in future if mlpack - * decides to use any other linear algebra library or want to support - * multiple linear algebra libraries, we can make the transition easily. - * This is to make the csv parser as generic as possible. - * + * Parses a csv file and loads the data into a given matrix. In the first pass, + * the function will determine the number of cols and rows in the given file. + * Once the rows and cols are fixed we initialize the matrix with zeros. In + * the second pass, the function converts each value to required datatype + * and sets it equal to val. + * + * This function uses MatType as template parameter in order to provide + * support for any type of matrices from any linear algebra library. 
+ * * @param x Matrix in which data will be loaded * @param f File stream to access the data file */ diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 1e4fda0f240..2441e9559a0 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -67,9 +67,9 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, - arma::Mat& matrix, + MatType& matrix, const bool fatal = false, const bool transpose = true, const file_type inputLoadType = file_type::AutoDetect); @@ -109,86 +109,6 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; these declarations aren't helpful to - * users. - * - * @cond - */ -/* -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -// size_t and uword should be one of these three typedefs. 
-extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const file_type); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -*/ -/** - * @endcond - */ - /** * Load a column vector from a file, guessing the filetype from the extension. * @@ -292,53 +212,6 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; they aren't helpful for users to know - * about. 
- * - * @cond - */ - -/* -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); -*/ -/** - * @endcond - */ - /** * Load a model from a file, guessing the filetype from the extension, or, * optionally, loading the specified format. If automatic extension detection diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 37b429eb0d4..16153053791 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -87,9 +87,9 @@ bool inline inplace_transpose(MatType& X, bool fatal) } } -template +template bool Load(const std::string& filename, - arma::Mat& matrix, + MatType& matrix, const bool fatal, const bool transpose, const file_type inputLoadType) @@ -174,8 +174,6 @@ bool Load(const std::string& filename, else success = matrix.load(stream, ToArmaFileType(loadType)); } - // success = LoadData(filename, matrix, file_type::csv_ascii); - // success = matrix.load(stream, loadType); else success = matrix.load(filename, ToArmaFileType(loadType)); diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 1e3f5a3486a..84cc1b22c00 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -61,9 +61,9 @@ namespace data /** Functions to load and save matrices. */ { * @param inputSaveType File type to save to (defaults to arma::auto_detect). * @return Boolean value indicating success or failure of save. 
*/ -template +template bool Save(const std::string& filename, - const arma::Mat& matrix, + const MatType& matrix, const bool fatal = false, bool transpose = true, file_type inputSaveType = file_type::AutoDetect); diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index 61bbd80942b..cdbf45c509a 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -43,9 +43,9 @@ bool Save(const std::string& filename, return Save(filename, rowvec, fatal, true, inputSaveType); } -template +template bool Save(const std::string& filename, - const arma::Mat& matrix, + const MatType& matrix, const bool fatal, bool transpose, file_type inputSaveType) @@ -101,7 +101,7 @@ bool Save(const std::string& filename, // Transpose the matrix. if (transpose) { - arma::Mat tmp = trans(matrix); + MatType tmp = trans(matrix); #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index a59b2e3d050..c4e09177180 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -10,22 +10,6 @@ * linear algebra library like armadillo, all fucntions related to handling the * types goes here. * - * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ - * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the * 3-clause BSD license along with mlpack. If not, see diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp index 9df7b7d5c41..2f779cdda22 100644 --- a/src/mlpack/core/data/types_impl.hpp +++ b/src/mlpack/core/data/types_impl.hpp @@ -10,22 +10,6 @@ * linear algebra library like armadillo, all fucntions related to handling the * types goes here. * - * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ - * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the * 3-clause BSD license along with mlpack. 
If not, see From d08b4e4885ee9eef9e2201b4d0ae63227be8235e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 22 Jun 2021 17:48:25 +0530 Subject: [PATCH 027/112] Changes in type checking --- src/mlpack/core/data/csv_parser_impl.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 1c010b121f6..e86badaf439 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -89,12 +89,11 @@ namespace data char* endptr = nullptr; - if ((std::is_floating_point::value) || - (std::is_integral::value)) + if (std::is_floating_point::value) { val = typename MatType::elem_type(std::strtod(str, &endptr)); } - else + else if (std::is_integral::value) { if (std::is_signed::value) { From 159e177312c3cea51323b93f2d42aad04d011bf4 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 24 Jun 2021 15:38:30 +0530 Subject: [PATCH 028/112] SFINAE for load() --- src/mlpack/core/data/load.hpp | 21 +++++++++++++-------- src/mlpack/core/data/load_impl.hpp | 8 +++++--- src/mlpack/core/data/load_vec_impl.hpp | 22 ++++++++++++---------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 2441e9559a0..23ee5fdd6c7 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -67,7 +67,8 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template +template::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal = false, @@ -103,9 +104,10 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading (default true). * @return Boolean value indicating success or failure of load. 
*/ -template +template::value>::type> bool Load(const std::string& filename, - arma::SpMat& matrix, + MatType& matrix, const bool fatal = false, const bool transpose = true); @@ -136,9 +138,10 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template::value>::type> bool Load(const std::string& filename, - arma::Col& vec, + MatType& vec, const bool fatal = false); /** @@ -168,9 +171,10 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template +template::value>::type> bool Load(const std::string& filename, - arma::Row& rowvec, + MatType& rowvec, const bool fatal = false); /** @@ -287,11 +291,12 @@ bool LoadImage(const std::string& filename, } // namespace data } // namespace mlpack +#include "load_impl.hpp" // Include implementation of model-loading Load() overload. #include "load_model_impl.hpp" // Include implementation of Load() for vectors. #include "load_vec_impl.hpp" // Include implementation of Load() for images. #include "load_image_impl.hpp" - +// Include implementation of Load() for matrix. 
#endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 16153053791..25286aa2d9b 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -87,7 +87,8 @@ bool inline inplace_transpose(MatType& X, bool fatal) } } -template +template::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal, @@ -301,9 +302,10 @@ bool Load(const std::string& filename, } // For loading data into sparse matrix -template +template ::value>::type> bool Load(const std::string& filename, - arma::SpMat& matrix, + MatType& matrix, const bool fatal, const bool transpose) { diff --git a/src/mlpack/core/data/load_vec_impl.hpp b/src/mlpack/core/data/load_vec_impl.hpp index c0549f486fa..ed4bff1ad4e 100644 --- a/src/mlpack/core/data/load_vec_impl.hpp +++ b/src/mlpack/core/data/load_vec_impl.hpp @@ -20,13 +20,14 @@ namespace mlpack { namespace data { // Load column vector. -template +template::value>::type>> bool Load(const std::string& filename, - arma::Col& vec, + MatType& vec, const bool fatal) { // First load into auxiliary matrix. - arma::Mat tmp; + arma::Mat tmp; bool success = Load(filename, tmp, fatal, false); if (!success) { @@ -70,7 +71,7 @@ bool Load(const std::string& filename, * Now we can call the move operator, but it has to be the move operator * for Mat, not for Col. This will avoid copying the data. */ - *((arma::Mat*) &vec) = std::move(tmp); + *((arma::Mat*) &vec) = std::move(tmp); return true; } } @@ -78,18 +79,19 @@ bool Load(const std::string& filename, { // It's loaded as a column vector. We can call the move constructor // directly. - *((arma::Mat*) &vec) = std::move(tmp); + *((arma::Mat*) &vec) = std::move(tmp); return true; } } // Load row vector. 
-template +template::value>::type> bool Load(const std::string& filename, - arma::Row& rowvec, + MatType& rowvec, const bool fatal) { - arma::Mat tmp; + arma::Mat tmp; bool success = Load(filename, tmp, fatal, false); if (!success) { @@ -132,14 +134,14 @@ bool Load(const std::string& filename, * Now we can call the move operator, but it has to be the move operator * for Mat, not for Col. This will avoid copying the data. */ - *((arma::Mat*) &rowvec) = std::move(tmp); + *((arma::Mat*) &rowvec) = std::move(tmp); return true; } } else { // It's loaded as a row vector. We can call the move constructor directly. - *((arma::Mat*) &rowvec) = std::move(tmp); + *((arma::Mat*) &rowvec) = std::move(tmp); return true; } } From 195c31aeabffe5dd66bd85854321074893e1a41e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 24 Jun 2021 16:07:03 +0530 Subject: [PATCH 029/112] Applied SFINAE in load() --- src/mlpack/core/data/load.hpp | 8 ++++---- src/mlpack/core/data/load_impl.hpp | 4 ++-- src/mlpack/core/data/load_vec_impl.hpp | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 23ee5fdd6c7..e1bf6c5a3f4 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -68,7 +68,7 @@ namespace data /** Functions to load and save matrices and models. */ { * @return Boolean value indicating success or failure of load. */ template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal = false, @@ -105,7 +105,7 @@ bool Load(const std::string& filename, * @return Boolean value indicating success or failure of load. */ template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal = false, @@ -139,7 +139,7 @@ bool Load(const std::string& filename, * @return Boolean value indicating success or failure of load. 
*/ template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& vec, const bool fatal = false); @@ -172,7 +172,7 @@ bool Load(const std::string& filename, * @return Boolean value indicating success or failure of load. */ template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& rowvec, const bool fatal = false); diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 25286aa2d9b..adbe59e3751 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -88,7 +88,7 @@ bool inline inplace_transpose(MatType& X, bool fatal) } template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal, @@ -303,7 +303,7 @@ bool Load(const std::string& filename, // For loading data into sparse matrix template ::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal, diff --git a/src/mlpack/core/data/load_vec_impl.hpp b/src/mlpack/core/data/load_vec_impl.hpp index ed4bff1ad4e..9d5b5e7dba1 100644 --- a/src/mlpack/core/data/load_vec_impl.hpp +++ b/src/mlpack/core/data/load_vec_impl.hpp @@ -21,7 +21,7 @@ namespace data { // Load column vector. template::value>::type>> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& vec, const bool fatal) @@ -86,7 +86,7 @@ bool Load(const std::string& filename, // Load row vector. 
template::value>::type> + typename std::enable_if>::value>::type> bool Load(const std::string& filename, MatType& rowvec, const bool fatal) From c2290fa7629b6ffe1696eb3284d4ddce96858df2 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 25 Jun 2021 18:58:59 +0530 Subject: [PATCH 030/112] trying SFINAE --- src/mlpack/core/data/load.hpp | 22 ++++++++++------------ src/mlpack/core/data/load_impl.hpp | 14 +++++++------- src/mlpack/core/data/load_vec_impl.hpp | 22 ++++++++++------------ 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index e1bf6c5a3f4..cf057fa0b6f 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -67,13 +67,14 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template>::value>::type> +template bool Load(const std::string& filename, MatType& matrix, const bool fatal = false, const bool transpose = true, - const file_type inputLoadType = file_type::AutoDetect); + const file_type inputLoadType = file_type::AutoDetect, + const typename std::enable_if>::value>::type* = 0, + const typename std::enable_if>::value>::type* = 0); /** * Loads a sparse matrix from file, using arma::coord_ascii format. This @@ -104,10 +105,9 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading (default true). * @return Boolean value indicating success or failure of load. */ -template>::value>::type> +template bool Load(const std::string& filename, - MatType& matrix, + arma::sp_mat& matrix, const bool fatal = false, const bool transpose = true); @@ -138,10 +138,9 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. 
*/ -template>::value>::type> +template bool Load(const std::string& filename, - MatType& vec, + arma::Col& vec, const bool fatal = false); /** @@ -171,10 +170,9 @@ bool Load(const std::string& filename, * @param fatal If an error should be reported as fatal (default false). * @return Boolean value indicating success or failure of load. */ -template>::value>::type> +template bool Load(const std::string& filename, - MatType& rowvec, + arma::Row& rowvec, const bool fatal = false); /** diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index adbe59e3751..d39fd1c9b9c 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -87,8 +87,8 @@ bool inline inplace_transpose(MatType& X, bool fatal) } } -template>::value>::type> +template + // typename std::enable_if::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal, @@ -240,8 +240,8 @@ bool Load(const std::string& filename, Log::Info << "Loading '" << filename << "' as CSV dataset. " << std::flush; try { - LoadCSV loader(filename); - loader.Load(matrix, info, transpose); + // LoadCSV loader(filename); + // loader.Load(matrix, info, transpose); } catch (std::exception& e) { @@ -302,10 +302,10 @@ bool Load(const std::string& filename, } // For loading data into sparse matrix -template >::value>::type> +template + // typename std::enable_if::value>::type> bool Load(const std::string& filename, - MatType& matrix, + arma::sp_mat& matrix, const bool fatal, const bool transpose) { diff --git a/src/mlpack/core/data/load_vec_impl.hpp b/src/mlpack/core/data/load_vec_impl.hpp index 9d5b5e7dba1..c0549f486fa 100644 --- a/src/mlpack/core/data/load_vec_impl.hpp +++ b/src/mlpack/core/data/load_vec_impl.hpp @@ -20,14 +20,13 @@ namespace mlpack { namespace data { // Load column vector. 
-template>::value>::type> +template bool Load(const std::string& filename, - MatType& vec, + arma::Col& vec, const bool fatal) { // First load into auxiliary matrix. - arma::Mat tmp; + arma::Mat tmp; bool success = Load(filename, tmp, fatal, false); if (!success) { @@ -71,7 +70,7 @@ bool Load(const std::string& filename, * Now we can call the move operator, but it has to be the move operator * for Mat, not for Col. This will avoid copying the data. */ - *((arma::Mat*) &vec) = std::move(tmp); + *((arma::Mat*) &vec) = std::move(tmp); return true; } } @@ -79,19 +78,18 @@ bool Load(const std::string& filename, { // It's loaded as a column vector. We can call the move constructor // directly. - *((arma::Mat*) &vec) = std::move(tmp); + *((arma::Mat*) &vec) = std::move(tmp); return true; } } // Load row vector. -template>::value>::type> +template bool Load(const std::string& filename, - MatType& rowvec, + arma::Row& rowvec, const bool fatal) { - arma::Mat tmp; + arma::Mat tmp; bool success = Load(filename, tmp, fatal, false); if (!success) { @@ -134,14 +132,14 @@ bool Load(const std::string& filename, * Now we can call the move operator, but it has to be the move operator * for Mat, not for Col. This will avoid copying the data. */ - *((arma::Mat*) &rowvec) = std::move(tmp); + *((arma::Mat*) &rowvec) = std::move(tmp); return true; } } else { // It's loaded as a row vector. We can call the move constructor directly. 
- *((arma::Mat*) &rowvec) = std::move(tmp); + *((arma::Mat*) &rowvec) = std::move(tmp); return true; } } From 409d7ff59e28913feb1b130b1f0457b4264d779c Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 30 Jun 2021 01:44:45 +0530 Subject: [PATCH 031/112] Created parser class --- src/mlpack/core/data/csv_parser.hpp | 5 +++++ src/mlpack/core/data/csv_parser_impl.hpp | 4 ++-- src/mlpack/core/data/load.hpp | 4 +--- src/mlpack/core/data/load_impl.hpp | 7 +++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 360f3845aea..9ed23f66133 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -40,6 +40,9 @@ namespace mlpack{ namespace data{ +class Parser +{ + public: /** * Convert the given string token to assigned datatype and assign * this value to the given address. The address here will be a @@ -72,6 +75,8 @@ bool ConvertToken(typename MatType::elem_type& val, const std::string& token); template bool LoadCSVFile(MatType& x, std::fstream& f); +}; + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index e86badaf439..18b5ba8f9f7 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -47,7 +47,7 @@ namespace data * example calling: convert_token(x.at(row, col), token) */ template - bool ConvertToken(typename MatType::elem_type& val, const std::string& token) + bool Parser::ConvertToken(typename MatType::elem_type& val, const std::string& token) { const size_t N = size_t(token.length()); @@ -123,7 +123,7 @@ namespace data * Parses the file and loads the data into the given matrix. 
*/ template - bool LoadCSVFile(MatType& x, std::fstream& f) + bool Parser::LoadCSVFile(MatType& x, std::fstream& f) { bool load_okay = f.good(); diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index cf057fa0b6f..fb0c365b1e8 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -72,9 +72,7 @@ bool Load(const std::string& filename, MatType& matrix, const bool fatal = false, const bool transpose = true, - const file_type inputLoadType = file_type::AutoDetect, - const typename std::enable_if>::value>::type* = 0, - const typename std::enable_if>::value>::type* = 0); + const file_type inputLoadType = file_type::AutoDetect); /** * Loads a sparse matrix from file, using arma::coord_ascii format. This diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index d39fd1c9b9c..a5b59a947f0 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -88,7 +88,6 @@ bool inline inplace_transpose(MatType& X, bool fatal) } template - // typename std::enable_if::value>::type> bool Load(const std::string& filename, MatType& matrix, const bool fatal, @@ -167,11 +166,12 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. 
bool success; - + Parser parser; + if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = LoadCSVFile(matrix, stream); + success = parser.LoadCSVFile(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); } @@ -303,7 +303,6 @@ bool Load(const std::string& filename, // For loading data into sparse matrix template - // typename std::enable_if::value>::type> bool Load(const std::string& filename, arma::sp_mat& matrix, const bool fatal, From 3f7f769f8c3e9a0ed0165e7ab8d409e57508ba66 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 30 Jun 2021 02:43:54 +0530 Subject: [PATCH 032/112] Added GetMatSize --- src/mlpack/core/data/csv_parser.hpp | 1 + src/mlpack/core/data/csv_parser_impl.hpp | 33 +++++++++++++++++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 9ed23f66133..06ba9d85abb 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -75,6 +75,7 @@ bool ConvertToken(typename MatType::elem_type& val, const std::string& token); template bool LoadCSVFile(MatType& x, std::fstream& f); +inline std::pair GetMatSize(std::fstream& f); }; } // namespace data diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 18b5ba8f9f7..6478f02329e 100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -118,13 +118,9 @@ namespace data return true; } - /** - * Returns a bool value showing whether data was loaded successfully or not. - * Parses the file and loads the data into the given matrix. 
- */ - template - bool Parser::LoadCSVFile(MatType& x, std::fstream& f) + inline std::pair Parser::GetMatSize(std::fstream& f) { + bool load_okay = f.good(); f.clear(); @@ -167,10 +163,33 @@ namespace data f.clear(); f.seekg(pos1); - x.set_size(f_n_rows, f_n_cols); + //x.set_size(f_n_rows, f_n_cols); + std::pair mat_size(f_n_rows, f_n_cols); + + return mat_size; + } + + /** + * Returns a bool value showing whether data was loaded successfully or not. + * Parses the file and loads the data into the given matrix. + */ + template + bool Parser::LoadCSVFile(MatType& x, std::fstream& f) + { + bool load_okay = f.good(); + + f.clear(); + + std::pair mat_size = GetMatSize(f); + + x.set_size(mat_size.first, mat_size.second); size_t row = 0; + std::string line_string; + std::stringstream line_stream; + std::string token; + while (f.good()) { std::getline(f, line_string); From 41f2ef98d38e00c7967e588a9d5c55e463ec0637 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 30 Jun 2021 19:47:06 +0530 Subject: [PATCH 033/112] test --- src/mlpack/core/data/csv_parser.hpp | 4 +-- src/mlpack/core/data/csv_parser_impl.hpp | 4 +-- src/mlpack/core/data/load_csv.hpp | 42 ++++++++++++++---------- src/mlpack/core/data/load_impl.hpp | 8 +++-- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 06ba9d85abb..129353bf784 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -73,9 +73,9 @@ bool ConvertToken(typename MatType::elem_type& val, const std::string& token); * @param f File stream to access the data file */ template -bool LoadCSVFile(MatType& x, std::fstream& f); +bool LoadCSVFile(MatType& x, std::ifstream& f); -inline std::pair GetMatSize(std::fstream& f); +inline std::pair GetMatSize(std::ifstream& f); }; } // namespace data diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/csv_parser_impl.hpp index 6478f02329e..62852bc76c7 
100644 --- a/src/mlpack/core/data/csv_parser_impl.hpp +++ b/src/mlpack/core/data/csv_parser_impl.hpp @@ -118,7 +118,7 @@ namespace data return true; } - inline std::pair Parser::GetMatSize(std::fstream& f) + inline std::pair Parser::GetMatSize(std::ifstream& f) { bool load_okay = f.good(); @@ -174,7 +174,7 @@ namespace data * Parses the file and loads the data into the given matrix. */ template - bool Parser::LoadCSVFile(MatType& x, std::fstream& f) + bool Parser::LoadCSVFile(MatType& x, std::ifstream& f) { bool load_okay = f.good(); diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index e6e6569ea4c..85b24b2146b 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -91,48 +91,56 @@ class LoadCSV cols = 0; // First, count the number of rows in the file (this is the dimensionality). - std::string line; - while (std::getline(inFile, line)) - { - ++rows; - } - info = DatasetMapper(rows); + // std::string line; + // while (std::getline(inFile, line)) + // { + // ++rows; + // } + + Parser parser; + + std::pair matSize = parser.GetMatSize(inFile); + + info = DatasetMapper(matSize.first); // Now, jump back to the beginning of the file. inFile.clear(); inFile.seekg(0, std::ios::beg); rows = 0; + std::string line; + while (std::getline(inFile, line)) { ++rows; // Remove whitespace from either side. boost::trim(line); - if (rows == 1) + /*if (rows == 1) { // Extract the number of columns. auto findColSize = [&cols](iter_type) { ++cols; }; qi::parse(line.begin(), line.end(), stringRule[findColSize] % delimiterRule); - } + }*/ // I guess this is technically a second pass, but that's ok... still the // same idea... if (MapPolicy::NeedsFirstPass) { + info.template MapFirstPass(std::move(line), rows-1); // In this case we must pass everything we parse to the MapPolicy. 
- auto firstPassMap = [&](const iter_type& iter) - { - std::string str(iter.begin(), iter.end()); - boost::trim(str); + // auto firstPassMap = [&](const iter_type& iter) + // { + // std::string str(iter.begin(), iter.end()); + // boost::trim(str); - info.template MapFirstPass(std::move(str), rows - 1); - }; + // info.template MapFirstPass(std::move(str), rows - 1); + // }; - // Now parse the line. - qi::parse(line.begin(), line.end(), - stringRule[firstPassMap] % delimiterRule); + // // Now parse the line. + // qi::parse(line.begin(), line.end(), + // stringRule[firstPassMap] % delimiterRule); } } } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index a5b59a947f0..61d229f43c0 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -98,6 +98,8 @@ bool Load(const std::string& filename, // Catch nonexistent files by opening the stream ourselves. std::fstream stream; + std::ifstream istream(filename); + #ifdef _WIN32 // Always open in binary mode on Windows. stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); #else @@ -171,7 +173,7 @@ bool Load(const std::string& filename, if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = parser.LoadCSVFile(matrix, stream); + success = parser.LoadCSVFile(matrix, istream); else success = matrix.load(stream, ToArmaFileType(loadType)); } @@ -240,8 +242,8 @@ bool Load(const std::string& filename, Log::Info << "Loading '" << filename << "' as CSV dataset. 
" << std::flush; try { - // LoadCSV loader(filename); - // loader.Load(matrix, info, transpose); + LoadCSV loader(filename); + loader.Load(matrix, info, transpose); } catch (std::exception& e) { From ae25a2635c4bea2c145f795933f90e017f249ca4 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 1 Jul 2021 16:32:37 +0530 Subject: [PATCH 034/112] Still not working --- src/mlpack/core/data/load.hpp | 2 +- src/mlpack/core/data/load_impl.hpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index fb0c365b1e8..75717c768d2 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -18,10 +18,10 @@ #include #include +#include "csv_parser.hpp" #include "format.hpp" #include "dataset_mapper.hpp" #include "image_info.hpp" -#include "csv_parser.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 61d229f43c0..819d9bef20c 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -22,7 +22,6 @@ #include "load.hpp" #include "extension.hpp" #include "detect_file_type.hpp" -#include "csv_parser.hpp" #include "types.hpp" #include From e2131127792d59d45cdae794e4847fefd67d9d7d Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 13:24:29 +0200 Subject: [PATCH 035/112] Add these files to get rid of them Signed-off-by: Omar Shrit --- src/mlpack/core/data/csv_parser.hpp | 64 +------------------ ...{csv_parser_impl.hpp => load_csv_impl.hpp} | 0 2 files changed, 3 insertions(+), 61 deletions(-) rename src/mlpack/core/data/{csv_parser_impl.hpp => load_csv_impl.hpp} (100%) diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp index 129353bf784..f0c9eb35ae2 100644 --- a/src/mlpack/core/data/csv_parser.hpp +++ b/src/mlpack/core/data/csv_parser.hpp @@ -1,32 +1,5 @@ /** * @file 
core/data/csv_parser.hpp - * @author Conrad Sanderson - * @author Gopi M. Tatiraju - * - * This csv parser is designed by taking reference from armadillo's csv parser. - * In this mlpack's version, all the arma dependencies were removed or replaced - * accordingly, making the parser totally independent of armadillo. - * - * This parser will be totally independent to any linear algebra library. - * This can be used to load data into any matrix, i.e. arma and bandicoot - * in future. - * - * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ - * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the * 3-clause BSD license along with mlpack. If not, see @@ -35,47 +8,16 @@ #ifndef MLPACK_CORE_DATA_CSV_PARSER_HPP #define MLPACK_CORE_DATA_CSV_PARSER_HPP -#include "types.hpp" namespace mlpack{ namespace data{ class Parser -{ - public: -/** - * Convert the given string token to assigned datatype and assign - * this value to the given address. The address here will be a - * matrix location. 
- * - * Token is always read as a string, if the given token is +/-INF or NAN - * it converts them to infinity and NAN using numeric_limits. - * - * @param val Token's value will be assigned to this address - * @param token Value which should be assigned - */ -template -bool ConvertToken(typename MatType::elem_type& val, const std::string& token); +{ + public: + -/** - * Returns a bool value showing whether data was loaded successfully or not. - * - * Parses a csv file and loads the data into a given matrix. In the first pass, - * the function will determine the number of cols and rows in the given file. - * Once the rows and cols are fixed we initialize the matrix with zeros. In - * the second pass, the function converts each value to required datatype - * and sets it equal to val. - * - * This function uses MatType as template parameter in order to provide - * support for any type of matrices from any linear algebra library. - * - * @param x Matrix in which data will be loaded - * @param f File stream to access the data file - */ -template -bool LoadCSVFile(MatType& x, std::ifstream& f); -inline std::pair GetMatSize(std::ifstream& f); }; } // namespace data diff --git a/src/mlpack/core/data/csv_parser_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp similarity index 100% rename from src/mlpack/core/data/csv_parser_impl.hpp rename to src/mlpack/core/data/load_csv_impl.hpp From 8f4ee4256eb73c027e29b04c4a4a0384d2aa5186 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 13:25:25 +0200 Subject: [PATCH 036/112] Remove csv file Signed-off-by: Omar Shrit --- src/mlpack/core/data/csv_parser.hpp | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 src/mlpack/core/data/csv_parser.hpp diff --git a/src/mlpack/core/data/csv_parser.hpp b/src/mlpack/core/data/csv_parser.hpp deleted file mode 100644 index f0c9eb35ae2..00000000000 --- a/src/mlpack/core/data/csv_parser.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/** - * @file 
core/data/csv_parser.hpp - * mlpack is free software; you may redistribute it and/or modify it under the - * terms of the 3-clause BSD license. You should have received a copy of the - * 3-clause BSD license along with mlpack. If not, see - * http://www.opensource.org/licenses/BSD-3-Clause for more information. - */ -#ifndef MLPACK_CORE_DATA_CSV_PARSER_HPP -#define MLPACK_CORE_DATA_CSV_PARSER_HPP - - -namespace mlpack{ -namespace data{ - -class Parser -{ - public: - - - -}; - -} // namespace data -} // namespace mlpack - -#include "csv_parser_impl.hpp" - -#endif From f9f7c8244922b02365567ddbfb286f73830e6707 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 14:24:13 +0200 Subject: [PATCH 037/112] Remove mlpack/core.hpp header from load_csv The bug was related to core.hpp as it was included in load_csv. There is no reason to include core.hpp here. Signed-off-by: Omar Shrit --- src/mlpack/core/data/load_csv.cpp | 5 ++ src/mlpack/core/data/load_csv.hpp | 81 +++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/mlpack/core/data/load_csv.cpp b/src/mlpack/core/data/load_csv.cpp index 5a150088232..d95843dae1f 100644 --- a/src/mlpack/core/data/load_csv.cpp +++ b/src/mlpack/core/data/load_csv.cpp @@ -17,6 +17,11 @@ using namespace boost::spirit; namespace mlpack { namespace data { +LoadCSV::LoadCSV() +{ + // Nothing to do here. +} + LoadCSV::LoadCSV(const std::string& file) : extension(Extension(file)), filename(file), diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 85b24b2146b..e81d03571ce 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -1,8 +1,32 @@ /** * @file core/data/load_csv.hpp * @author ThamNgapWei + * @author Conrad Sanderson + * @author Gopi M. Tatiraju * - * This is a csv parsers which use to parse the csv file format + * This csv parser is designed by taking reference from armadillo's csv parser. 
+ * In this mlpack's version, all the arma dependencies were removed or replaced + * accordingly, making the parser totally independent of armadillo. + * + * This parser will be totally independent to any linear algebra library. + * This can be used to load data into any matrix, i.e. arma and bandicoot + * in future. + * + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the @@ -15,7 +39,6 @@ #include #include -#include #include #include @@ -24,24 +47,60 @@ #include "extension.hpp" #include "format.hpp" #include "dataset_mapper.hpp" +#include "types.hpp" namespace mlpack { namespace data { /** - *Load the csv file.This class use boost::spirit - *to implement the parser, please refer to following link - *http://theboostcpplibraries.com/boost.spirit for quick review. + * Load the csv file.This class use boost::spirit + * to implement the parser, please refer to following link + * http://theboostcpplibraries.com/boost.spirit for quick review. 
*/ class LoadCSV { public: + + // Do nothing, just a place holder, to be removed later. + LoadCSV(); /** * Construct the LoadCSV object on the given file. This will construct the * rules necessary for loading and attempt to open the file. */ LoadCSV(const std::string& file); + /** + * Convert the given string token to assigned datatype and assign + * this value to the given address. The address here will be a + * matrix location. + * + * Token is always read as a string, if the given token is +/-INF or NAN + * it converts them to infinity and NAN using numeric_limits. + * + * @param val Token's value will be assigned to this address + * @param token Value which should be assigned + */ + template + bool ConvertToken(typename MatType::elem_type& val, const std::string& token); + + /** + * Returns a bool value showing whether data was loaded successfully or not. + * + * Parses a csv file and loads the data into a given matrix. In the first pass, + * the function will determine the number of cols and rows in the given file. + * Once the rows and cols are fixed we initialize the matrix with zeros. In + * the second pass, the function converts each value to required datatype + * and sets it equal to val. + * + * This function uses MatType as template parameter in order to provide + * support for any type of matrices from any linear algebra library. + * + * @param x Matrix in which data will be loaded + * @param f File stream to access the data file + */ + template + bool LoadCSVFile(MatType& x, std::ifstream& f); + /** * Load the file into the given matrix with the given DatasetMapper object. * Throws exceptions on errors. @@ -97,11 +156,11 @@ class LoadCSV // ++rows; // } - Parser parser; + // Parser parser; - std::pair matSize = parser.GetMatSize(inFile); + // std::pair matSize = parser.GetMatSize(inFile); - info = DatasetMapper(matSize.first); + // info = DatasetMapper(matSize.first); // Now, jump back to the beginning of the file. 
inFile.clear(); @@ -128,7 +187,7 @@ class LoadCSV // same idea... if (MapPolicy::NeedsFirstPass) { - info.template MapFirstPass(std::move(line), rows-1); + // info.template MapFirstPass(std::move(line), rows-1); // In this case we must pass everything we parse to the MapPolicy. // auto firstPassMap = [&](const iter_type& iter) // { @@ -365,6 +424,8 @@ class LoadCSV } } + inline std::pair GetMatSize(std::ifstream& f); + //! Spirit rule for parsing. boost::spirit::qi::rule stringRule; //! Spirit rule for delimiters (i.e. ',' for CSVs). @@ -381,4 +442,6 @@ class LoadCSV } // namespace data } // namespace mlpack +#include "load_csv_impl.hpp" + #endif From 7eac0dcf27da039f83c4a2e52c1490496095ff59 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 14:25:29 +0200 Subject: [PATCH 038/112] Convert csv_parser to load_csv_impl Signed-off-by: Omar Shrit --- src/mlpack/core/data/load_csv_impl.hpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 62852bc76c7..247442b8f70 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -1,5 +1,6 @@ /** - * @file core/data/csv_parser_impl.hpp + * @file core/data/load_csv_impl.hpp + * * @author Conrad Sanderson * @author Gopi M. 
Tatiraju * @@ -35,7 +36,7 @@ #ifndef MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP #define MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP -#include "csv_parser.hpp" +#include "load_csv.hpp" namespace mlpack { @@ -47,7 +48,7 @@ namespace data * example calling: convert_token(x.at(row, col), token) */ template - bool Parser::ConvertToken(typename MatType::elem_type& val, const std::string& token) + bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) { const size_t N = size_t(token.length()); @@ -72,15 +73,15 @@ namespace data if (((sig_a == 'i') || (sig_a == 'I')) && ((sig_b == 'n') || (sig_b == 'N')) && - ((sig_c == 'f') || (sig_c == 'F'))) + ((sig_c == 'f') || (sig_c == 'F'))) { val = neg ? -(std::numeric_limits::infinity()) : - std::numeric_limits::infinity(); + std::numeric_limits::infinity(); return true; } else if (((sig_a == 'n') || (sig_a == 'N')) && - ((sig_b == 'a') || (sig_b == 'A')) && - ((sig_c == 'n') || (sig_c == 'N'))) + ((sig_b == 'a') || (sig_b == 'A')) && + ((sig_c == 'n') || (sig_c == 'N'))) { val = std::numeric_limits::quiet_NaN(); return true; @@ -118,7 +119,7 @@ namespace data return true; } - inline std::pair Parser::GetMatSize(std::ifstream& f) + inline std::pair LoadCSV::GetMatSize(std::ifstream& f) { bool load_okay = f.good(); @@ -174,7 +175,7 @@ namespace data * Parses the file and loads the data into the given matrix. 
*/ template - bool Parser::LoadCSVFile(MatType& x, std::ifstream& f) + bool LoadCSV::LoadCSVFile(MatType& x, std::ifstream& f) { bool load_okay = f.good(); From 0c937e1914108bea0712b05011feda4470c25f56 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 14:28:14 +0200 Subject: [PATCH 039/112] Clean headers Signed-off-by: Omar Shrit --- src/mlpack/core/data/detect_file_type.cpp | 2 +- src/mlpack/core/data/detect_file_type.hpp | 2 +- src/mlpack/core/data/load.hpp | 7 +++++-- src/mlpack/core/data/load_impl.hpp | 9 +++------ src/mlpack/core/data/save.hpp | 3 +-- src/mlpack/core/data/save_impl.hpp | 1 - 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 129f29b7130..9e2eee79e5d 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -38,7 +38,7 @@ std::string GetStringType(const file_type& type) case file_type::ArmaBinary: return "Armadillo binary formatted data"; case file_type::PGMBinary: return "PGM data"; case file_type::HDF5Binary: return "HDF5 data"; - default: return ""; + default: return ""; } } diff --git a/src/mlpack/core/data/detect_file_type.hpp b/src/mlpack/core/data/detect_file_type.hpp index 6cf2ca124d3..9394748a292 100644 --- a/src/mlpack/core/data/detect_file_type.hpp +++ b/src/mlpack/core/data/detect_file_type.hpp @@ -15,7 +15,7 @@ #ifndef MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP #define MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP -#include "csv_parser.hpp" +#include "types.hpp" namespace mlpack { namespace data { diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 75717c768d2..c8b7a339ce8 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -18,10 +18,12 @@ #include #include -#include "csv_parser.hpp" #include "format.hpp" #include "dataset_mapper.hpp" #include "image_info.hpp" +#include "load_csv.hpp" +#include "load_arff.hpp" +#include 
"detect_file_type.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { @@ -287,6 +289,7 @@ bool LoadImage(const std::string& filename, } // namespace data } // namespace mlpack +// Include implementation of Load() for matrix. #include "load_impl.hpp" // Include implementation of model-loading Load() overload. #include "load_model_impl.hpp" @@ -294,5 +297,5 @@ bool LoadImage(const std::string& filename, #include "load_vec_impl.hpp" // Include implementation of Load() for images. #include "load_image_impl.hpp" -// Include implementation of Load() for matrix. + #endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 819d9bef20c..7be7abbedb1 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -13,13 +13,12 @@ #define MLPACK_CORE_DATA_LOAD_IMPL_HPP // In case it hasn't already been included. +#include "load.hpp" #include #include #include -#include "load_csv.hpp" -#include "load.hpp" #include "extension.hpp" #include "detect_file_type.hpp" #include "types.hpp" @@ -28,8 +27,6 @@ #include #include -#include "load_arff.hpp" - namespace mlpack { namespace data { @@ -167,12 +164,12 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. bool success; - Parser parser; + LoadCSV load; if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = parser.LoadCSVFile(matrix, istream); + success = load.LoadCSVFile(matrix, istream); else success = matrix.load(stream, ToArmaFileType(loadType)); } diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 84cc1b22c00..5e2c516f179 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -20,8 +20,7 @@ #include "format.hpp" #include "image_info.hpp" -#include "csv_parser.hpp" -#include "types.hpp" +#include "detect_file_type.hpp" namespace mlpack { namespace data /** Functions to load and save matrices. 
*/ { diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index cdbf45c509a..ce8db3874d5 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -15,7 +15,6 @@ // In case it hasn't already been included. #include "save.hpp" #include "extension.hpp" -#include "detect_file_type.hpp" #include #include From d4f79d4f4a0fb230af3856e88e7496b5091ed68d Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 14:28:30 +0200 Subject: [PATCH 040/112] Remove csv parser from CMakeLists Signed-off-by: Omar Shrit --- src/mlpack/core/data/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index 0f188703b65..d0c029a829b 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -33,8 +33,6 @@ set(SOURCES confusion_matrix.hpp one_hot_encoding.hpp one_hot_encoding_impl.hpp - csv_parser.hpp - csv_parser_impl.hpp types.hpp types_impl.hpp ) From ea95bb9784d7e7418f2ac818e1a9e59f8a9aa76e Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sat, 3 Jul 2021 14:29:02 +0200 Subject: [PATCH 041/112] Remove csv parser no reason for it Signed-off-by: Omar Shrit --- src/mlpack/core.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp index e05dc9f2083..d29a1747246 100644 --- a/src/mlpack/core.hpp +++ b/src/mlpack/core.hpp @@ -90,7 +90,6 @@ #include #include #include -#include // mlpack::backtrace only for linux #ifdef HAS_BFD_DL From 94265453110c5020e243ab59ebfa6bdad050f1bf Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 17 Jul 2021 05:31:51 +0530 Subject: [PATCH 042/112] Removed spirits from GetMatrixSize() and GetTransposeMatrixSize() --- src/mlpack/core/data/load_csv.hpp | 102 +++++++++++++------------ src/mlpack/core/data/load_csv_impl.hpp | 2 +- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp 
b/src/mlpack/core/data/load_csv.hpp index e81d03571ce..37f25fc5fac 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -150,56 +150,61 @@ class LoadCSV cols = 0; // First, count the number of rows in the file (this is the dimensionality). - // std::string line; - // while (std::getline(inFile, line)) - // { - // ++rows; - // } - - // Parser parser; - - // std::pair matSize = parser.GetMatSize(inFile); + std::string line; + while (std::getline(inFile, line)) + { + ++rows; + } - // info = DatasetMapper(matSize.first); + // Reset the DatasetInfo object, if needed. + if (info.Dimensionality() == 0) + { + info.SetDimensionality(rows); + } + else if (info.Dimensionality() != rows) + { + std::ostringstream oss; + oss << "data::LoadCSV(): given DatasetInfo has dimensionality " + << info.Dimensionality() << ", but data has dimensionality " + << rows; + throw std::invalid_argument(oss.str()); + } // Now, jump back to the beginning of the file. inFile.clear(); inFile.seekg(0, std::ios::beg); rows = 0; - - std::string line; - + while (std::getline(inFile, line)) { ++rows; - // Remove whitespace from either side. - boost::trim(line); - /*if (rows == 1) + if (rows == 1) { // Extract the number of columns. - auto findColSize = [&cols](iter_type) { ++cols; }; - qi::parse(line.begin(), line.end(), - stringRule[findColSize] % delimiterRule); - }*/ + std::pair dimen = GetMatSize(inFile); + cols = dimen.second; + } // I guess this is technically a second pass, but that's ok... still the // same idea... if (MapPolicy::NeedsFirstPass) { - // info.template MapFirstPass(std::move(line), rows-1); // In this case we must pass everything we parse to the MapPolicy. 
- // auto firstPassMap = [&](const iter_type& iter) - // { - // std::string str(iter.begin(), iter.end()); - // boost::trim(str); - - // info.template MapFirstPass(std::move(str), rows - 1); - // }; + std::string str(line.begin(), line.end()); + + for(int i = 0; i < str.size(); i++) + { + // Maybe there is a faster way to parser each element of the string + // Also for now it is being considered that delimiter will always + // be comma(,) + if(str[i] != ',') + { + std::string cc(1, str[i]); + info.template MapFirstPass(std::move(cc), rows - 1); + } + } - // // Now parse the line. - // qi::parse(line.begin(), line.end(), - // stringRule[firstPassMap] % delimiterRule); } } } @@ -236,37 +241,36 @@ class LoadCSV while (std::getline(inFile, line)) { ++cols; - // Remove whitespace from either side. - boost::trim(line); if (cols == 1) { // Extract the number of dimensions. - auto findRowSize = [&rows](iter_type) { ++rows; }; - qi::parse(line.begin(), line.end(), - stringRule[findRowSize] % delimiterRule); + std::pair dimen = GetMatSize(inFile); + rows = dimen.second; - // Now that we know the dimensionality, initialize the DatasetMapper. - info.SetDimensionality(rows); + // Now that we know the dimensionality, initialize the DatasetMapper. + info.SetDimensionality(rows); } - // If we need to do a first pass for the DatasetMapper, do it. + // If we need to do a first pas12dds for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { size_t dim = 0; // In this case we must pass everything we parse to the MapPolicy. - auto firstPassMap = [&](const iter_type& iter) - { - std::string str(iter.begin(), iter.end()); - boost::trim(str); + std::string str(line.begin(), line.end()); - info.template MapFirstPass(std::move(str), dim++); - }; - - // Now parse the line. 
- qi::parse(line.begin(), line.end(), - stringRule[firstPassMap] % delimiterRule); + // Maybe there is a faster way to parser each element of the string + // Also for now it is being considered that delimiter will always + // be comma(,) + for(int i = 0; i < str.size(); i++) + { + if(str[i] != ',') + { + std::string cc(1, str[i]); + info.template MapFirstPass(std::move(cc), dim++); + } + } } } } diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 247442b8f70..bbc53fdba2c 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -183,7 +183,7 @@ namespace data std::pair mat_size = GetMatSize(f); - x.set_size(mat_size.first, mat_size.second); + x.set_size(mat_size.second, mat_size.first); size_t row = 0; From 4402e5e2f2a27537dc5512b347ee0d3ff7e9ec51 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 19 Jul 2021 11:41:11 +0530 Subject: [PATCH 043/112] minor chages --- src/mlpack/core/data/load_csv.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 37f25fc5fac..616a49273bb 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -8,10 +8,6 @@ * In this mlpack's version, all the arma dependencies were removed or replaced * accordingly, making the parser totally independent of armadillo. * - * This parser will be totally independent to any linear algebra library. - * This can be used to load data into any matrix, i.e. arma and bandicoot - * in future. - * * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) * Copyright 2008-2016 National ICT Australia (NICTA) @@ -178,11 +174,10 @@ class LoadCSV while (std::getline(inFile, line)) { ++rows; - if (rows == 1) { // Extract the number of columns. 
- std::pair dimen = GetMatSize(inFile); + std::pair dimen = GetMatSize(inFile); cols = dimen.second; } @@ -252,7 +247,7 @@ class LoadCSV info.SetDimensionality(rows); } - // If we need to do a first pas12dds for the DatasetMapper, do it. + // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { size_t dim = 0; From fb251ab25ba6884a00c257a007cc538843fc4ae1 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 22 Jul 2021 22:38:34 +0530 Subject: [PATCH 044/112] Removing arma::file_type from load_save_test.cpp --- src/mlpack/core/data/types.hpp | 6 +++--- src/mlpack/core/data/types_impl.hpp | 3 +-- src/mlpack/tests/load_save_test.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index c4e09177180..a54e36a076d 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -43,13 +43,13 @@ enum struct file_type }; /** - * WHhere should I place this fucntion? * This fucntion is used to convert mlpack file type to respective * arma file type. 
* - * @param type Mlpack's file_type which will we converted to arma's file_type + * @param type mlpack::file_type */ -inline arma::file_type ToArmaFileType(file_type& type); +inline arma::file_type ToArmaFileType(const file_type& type); + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp index 2f779cdda22..7ebb8e1f897 100644 --- a/src/mlpack/core/data/types_impl.hpp +++ b/src/mlpack/core/data/types_impl.hpp @@ -26,7 +26,7 @@ namespace mlpack namespace data { -inline arma::file_type ToArmaFileType(file_type& type) +inline arma::file_type ToArmaFileType(const file_type& type) { switch(type) { @@ -82,5 +82,4 @@ inline arma::file_type ToArmaFileType(file_type& type) } // namespace data } // namespace mlpack - #endif diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 44e48e338e8..f5bf6d39f69 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -85,7 +85,7 @@ TEST_CASE("WrongExtensionCorrectLoad", "[LoadSaveTest]") // Now reload through our interface. 
REQUIRE( - data::Load("test_file.csv", test, false, true, arma::arma_binary) + data::Load("test_file.csv", test, false, true, file_type::ArmaBinary) == true); REQUIRE(test.n_rows == 4); @@ -269,7 +269,7 @@ TEST_CASE("LoadAnyExtensionFileTest", "[LoadSaveTest]") f.close(); arma::mat test; - REQUIRE(data::Load("test_file.blah", test, false, true, arma::raw_ascii)); + REQUIRE(data::Load("test_file.blah", test, false, true, file_type::RawASCII)); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); @@ -979,10 +979,10 @@ TEST_CASE("SaveArmaBinaryArbitraryExtensionTest", "[LoadSaveTest]") "4 8;"; REQUIRE(data::Save("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + file_type::ArmaBinary) == true); REQUIRE(data::Load("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + file_type::ArmaBinary) == true); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); From 722d1d2d476585940b0602b74a21a77ac8e8a548 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 22 Jul 2021 23:00:34 +0530 Subject: [PATCH 045/112] resolving issue --- src/mlpack/core/data/load_csv.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 02ecd543122..29b05b002e5 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -257,10 +257,8 @@ class LoadCSV std::pair dimen = GetMatSize(inFile); rows = dimen.second; -<<<<<<< HEAD // Now that we know the dimensionality, initialize the DatasetMapper. info.SetDimensionality(rows); -======= // Reset the DatasetInfo object, if needed. if (info.Dimensionality() == 0) { @@ -274,7 +272,6 @@ class LoadCSV << rows; throw std::invalid_argument(oss.str()); } ->>>>>>> 1fa385493a679809822f92151add8bc50a6f3260 } // If we need to do a first pass for the DatasetMapper, do it. 
From 64c4d2fd831190e9686fb2ecad6990e0e5750de0 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 22 Jul 2021 23:57:54 +0530 Subject: [PATCH 046/112] Running tests locally --- src/mlpack/tests/load_save_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 44e48e338e8..f5bf6d39f69 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -85,7 +85,7 @@ TEST_CASE("WrongExtensionCorrectLoad", "[LoadSaveTest]") // Now reload through our interface. REQUIRE( - data::Load("test_file.csv", test, false, true, arma::arma_binary) + data::Load("test_file.csv", test, false, true, file_type::ArmaBinary) == true); REQUIRE(test.n_rows == 4); @@ -269,7 +269,7 @@ TEST_CASE("LoadAnyExtensionFileTest", "[LoadSaveTest]") f.close(); arma::mat test; - REQUIRE(data::Load("test_file.blah", test, false, true, arma::raw_ascii)); + REQUIRE(data::Load("test_file.blah", test, false, true, file_type::RawASCII)); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); @@ -979,10 +979,10 @@ TEST_CASE("SaveArmaBinaryArbitraryExtensionTest", "[LoadSaveTest]") "4 8;"; REQUIRE(data::Save("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + file_type::ArmaBinary) == true); REQUIRE(data::Load("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + file_type::ArmaBinary) == true); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); From da06849787a03ea54a8bf49d22b6ca66f1e58211 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 23 Jul 2021 12:41:14 +0530 Subject: [PATCH 047/112] chaning int to size_t in ConvertToken, changing set_size to zeros --- src/mlpack/core/data/load_csv_impl.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index bbc53fdba2c..994516f628c 100644 --- 
a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -119,7 +119,7 @@ namespace data return true; } - inline std::pair LoadCSV::GetMatSize(std::ifstream& f) + inline std::pair LoadCSV::GetMatSize(std::ifstream& f) { bool load_okay = f.good(); @@ -164,8 +164,7 @@ namespace data f.clear(); f.seekg(pos1); - //x.set_size(f_n_rows, f_n_cols); - std::pair mat_size(f_n_rows, f_n_cols); + std::pair mat_size(f_n_rows, f_n_cols); return mat_size; } @@ -181,9 +180,9 @@ namespace data f.clear(); - std::pair mat_size = GetMatSize(f); + std::pair mat_size = GetMatSize(f); - x.set_size(mat_size.second, mat_size.first); + x.zeros(mat_size.second, mat_size.first); size_t row = 0; From 704a1203f8a7f39a5f37049a516c05a1c1cc30df Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 23 Jul 2021 12:51:21 +0530 Subject: [PATCH 048/112] missed these size_t changes --- src/mlpack/core/data/load_csv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 29b05b002e5..ec5b1861368 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -191,7 +191,7 @@ class LoadCSV if (rows == 1) { // Extract the number of columns. - std::pair dimen = GetMatSize(inFile); + std::pair dimen = GetMatSize(inFile); cols = dimen.second; } @@ -450,7 +450,7 @@ class LoadCSV } } - inline std::pair GetMatSize(std::ifstream& f); + inline std::pair GetMatSize(std::ifstream& f); //! Spirit rule for parsing. 
boost::spirit::qi::rule stringRule; From aa3b189e21d28ac894dbfc6792e7ad06a054e709 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 23 Jul 2021 18:20:42 +0530 Subject: [PATCH 049/112] Adding cmakefile --- src/mlpack/tests/CMakeLists.txt | 344 ++++++++++++++-------------- src/mlpack/tests/load_save_test.cpp | 15 +- 2 files changed, 183 insertions(+), 176 deletions(-) diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 33b10aa735d..4727fe3d3ba 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -3,180 +3,180 @@ include(CTest) # mlpack test executable. add_executable(mlpack_test EXCLUDE_FROM_ALL - activation_functions_test.cpp - adaboost_test.cpp - akfn_test.cpp - aknn_test.cpp - ann_dist_test.cpp - ann_layer_test.cpp - ann_regularizer_test.cpp - ann_test_tools.hpp - ann_visitor_test.cpp - armadillo_svd_test.cpp - arma_extend_test.cpp - async_learning_test.cpp - augmented_rnns_tasks_test.cpp - bayesian_linear_regression_test.cpp - bias_svd_test.cpp - binarize_test.cpp - block_krylov_svd_test.cpp - callback_test.cpp - cf_test.cpp - cli_binding_test.cpp - convolutional_network_test.cpp - convolution_test.cpp - cosine_tree_test.cpp - cv_test.cpp - dbscan_test.cpp - dcgan_test.cpp - decision_tree_regressor_test.cpp - decision_tree_test.cpp - det_test.cpp - distribution_test.cpp - drusilla_select_test.cpp - emst_test.cpp - facilities_test.cpp - fastmks_test.cpp - feedforward_network_test.cpp - feedforward_network_2_test.cpp - gan_test.cpp - gmm_test.cpp - hmm_test.cpp - hpt_test.cpp - hoeffding_tree_test.cpp - hyperplane_test.cpp - image_load_test.cpp - imputation_test.cpp - init_rules_test.cpp - io_test.cpp - kde_test.cpp - kernel_pca_test.cpp - kernel_test.cpp - kernel_traits_test.cpp - kfn_test.cpp - kmeans_test.cpp - knn_test.cpp - krann_search_test.cpp - ksinit_test.cpp - lars_test.cpp - layer_names_test.cpp - lin_alg_test.cpp - linear_regression_test.cpp - lmnn_test.cpp - linear_svm_test.cpp + 
#activation_functions_test.cpp + #adaboost_test.cpp + #akfn_test.cpp + #aknn_test.cpp + #ann_dist_test.cpp + #ann_layer_test.cpp + #ann_regularizer_test.cpp + #ann_test_tools.hpp + #ann_visitor_test.cpp + #armadillo_svd_test.cpp + #arma_extend_test.cpp + #async_learning_test.cpp + #augmented_rnns_tasks_test.cpp + #bayesian_linear_regression_test.cpp + #bias_svd_test.cpp + #binarize_test.cpp + #block_krylov_svd_test.cpp + #callback_test.cpp + #cf_test.cpp + #cli_binding_test.cpp + #convolutional_network_test.cpp + #convolution_test.cpp + #cosine_tree_test.cpp + #cv_test.cpp + #dbscan_test.cpp + #dcgan_test.cpp + #decision_tree_regressor_test.cpp + #decision_tree_test.cpp + #det_test.cpp + #distribution_test.cpp + #drusilla_select_test.cpp + #emst_test.cpp + #facilities_test.cpp + #fastmks_test.cpp + #feedforward_network_test.cpp + #feedforward_network_2_test.cpp + #gan_test.cpp + #gmm_test.cpp + #hmm_test.cpp + #hpt_test.cpp + #hoeffding_tree_test.cpp + #hyperplane_test.cpp + #image_load_test.cpp + #imputation_test.cpp + #init_rules_test.cpp + #io_test.cpp + #kde_test.cpp + #kernel_pca_test.cpp + #kernel_test.cpp + #kernel_traits_test.cpp + #kfn_test.cpp + #kmeans_test.cpp + #knn_test.cpp + #krann_search_test.cpp + #ksinit_test.cpp + #lars_test.cpp + #layer_names_test.cpp + #lin_alg_test.cpp + #linear_regression_test.cpp + #lmnn_test.cpp + #linear_svm_test.cpp load_save_test.cpp - local_coordinate_coding_test.cpp - logistic_regression_test.cpp - log_test.cpp - loss_functions_test.cpp - lsh_test.cpp - main.cpp - math_test.cpp - matrix_completion_test.cpp - maximal_inputs_test.cpp - metric_test.cpp - mean_shift_test.cpp - mock_categorical_data.hpp - nbc_test.cpp - nca_test.cpp - nmf_test.cpp - nystroem_method_test.cpp - octree_test.cpp - one_hot_encoding_test.cpp - pca_test.cpp - perceptron_test.cpp - prefixedoutstream_test.cpp - python_binding_test.cpp - qdafn_test.cpp - quic_svd_test.cpp - q_learning_test.cpp - radical_test.cpp - random_forest_test.cpp - 
random_test.cpp - randomized_svd_test.cpp - range_search_test.cpp - rbm_network_test.cpp - rectangle_tree_test.cpp - recurrent_network_test.cpp - rnn_reber_test.cpp - regularized_svd_test.cpp - reward_clipping_test.cpp - rl_components_test.cpp - scaling_test.cpp - size_checks_test.cpp - serialization.cpp - serialization.hpp - serialization_test.cpp - sfinae_test.cpp - softmax_regression_test.cpp - sort_policy_test.cpp - sparse_autoencoder_test.cpp - sparse_coding_test.cpp - spill_tree_test.cpp - split_data_test.cpp - string_encoding_test.cpp - sumtree_test.cpp - svd_batch_test.cpp - svd_incremental_test.cpp - svdplusplus_test.cpp - termination_policy_test.cpp + #local_coordinate_coding_test.cpp + #logistic_regression_test.cpp + #log_test.cpp + #loss_functions_test.cpp + #lsh_test.cpp + #main.cpp + #math_test.cpp + #matrix_completion_test.cpp + #maximal_inputs_test.cpp + #metric_test.cpp + #mean_shift_test.cpp + #mock_categorical_data.hpp + #nbc_test.cpp + #nca_test.cpp + #nmf_test.cpp + #nystroem_method_test.cpp + #octree_test.cpp + #one_hot_encoding_test.cpp + #pca_test.cpp + #perceptron_test.cpp + #prefixedoutstream_test.cpp + #python_binding_test.cpp + #qdafn_test.cpp + #quic_svd_test.cpp + #q_learning_test.cpp + #radical_test.cpp + #random_forest_test.cpp + #random_test.cpp + #randomized_svd_test.cpp + #range_search_test.cpp + #rbm_network_test.cpp + #rectangle_tree_test.cpp + #recurrent_network_test.cpp + #rnn_reber_test.cpp + #regularized_svd_test.cpp + #reward_clipping_test.cpp + #rl_components_test.cpp + #scaling_test.cpp + #size_checks_test.cpp + #serialization.cpp + #serialization.hpp + #serialization_test.cpp + #sfinae_test.cpp + #softmax_regression_test.cpp + #sort_policy_test.cpp + #sparse_autoencoder_test.cpp + #sparse_coding_test.cpp + #spill_tree_test.cpp + #split_data_test.cpp + #string_encoding_test.cpp + #sumtree_test.cpp + #svd_batch_test.cpp + #svd_incremental_test.cpp + #svdplusplus_test.cpp + #termination_policy_test.cpp test_catch_tools.hpp 
- test_function_tools.hpp - timer_test.cpp - tree_test.cpp - tree_traits_test.cpp - ub_tree_test.cpp - union_find_test.cpp - vantage_point_tree_test.cpp - wgan_test.cpp - main_tests/adaboost_test.cpp - main_tests/approx_kfn_test.cpp - main_tests/bayesian_linear_regression_test.cpp - main_tests/cf_test.cpp - main_tests/dbscan_test.cpp - main_tests/decision_tree_test.cpp - main_tests/det_test.cpp - main_tests/emst_test.cpp - main_tests/fastmks_test.cpp - main_tests/gmm_generate_test.cpp - main_tests/gmm_probability_test.cpp - main_tests/gmm_train_test.cpp - main_tests/hmm_generate_test.cpp - main_tests/hmm_loglik_test.cpp - main_tests/hmm_test_utils.hpp - main_tests/hmm_train_test.cpp - main_tests/hmm_viterbi_test.cpp - main_tests/hoeffding_tree_test.cpp - main_tests/image_converter_test.cpp - main_tests/kde_test.cpp - main_tests/kernel_pca_test.cpp - main_tests/kfn_test.cpp - main_tests/kmeans_test.cpp - main_tests/knn_test.cpp - main_tests/krann_test.cpp - main_tests/linear_regression_test.cpp - main_tests/lmnn_test.cpp - main_tests/linear_svm_test.cpp - main_tests/local_coordinate_coding_test.cpp - main_tests/logistic_regression_test.cpp - main_tests/lsh_test.cpp - main_tests/mean_shift_test.cpp - main_tests/nbc_test.cpp - main_tests/nca_test.cpp - main_tests/nmf_test.cpp - main_tests/pca_test.cpp - main_tests/perceptron_test.cpp - main_tests/preprocess_binarize_test.cpp - main_tests/preprocess_imputer_test.cpp - main_tests/preprocess_one_hot_encode_test.cpp - main_tests/preprocess_scale_test.cpp - main_tests/preprocess_split_test.cpp - main_tests/radical_test.cpp - main_tests/random_forest_test.cpp - main_tests/softmax_regression_test.cpp - main_tests/sparse_coding_test.cpp - main_tests/range_search_test.cpp - main_tests/test_helper.hpp + #test_function_tools.hpp + #timer_test.cpp + #tree_test.cpp + #tree_traits_test.cpp + #ub_tree_test.cpp + #union_find_test.cpp + #vantage_point_tree_test.cpp + #wgan_test.cpp + #main_tests/adaboost_test.cpp + 
#main_tests/approx_kfn_test.cpp + #main_tests/bayesian_linear_regression_test.cpp + #main_tests/cf_test.cpp + #main_tests/dbscan_test.cpp + #main_tests/decision_tree_test.cpp + #main_tests/det_test.cpp + #main_tests/emst_test.cpp + #main_tests/fastmks_test.cpp + #main_tests/gmm_generate_test.cpp + #main_tests/gmm_probability_test.cpp + #main_tests/gmm_train_test.cpp + #main_tests/hmm_generate_test.cpp + #main_tests/hmm_loglik_test.cpp + #main_tests/hmm_test_utils.hpp + #main_tests/hmm_train_test.cpp + #main_tests/hmm_viterbi_test.cpp + #main_tests/hoeffding_tree_test.cpp + #main_tests/image_converter_test.cpp + #main_tests/kde_test.cpp + #main_tests/kernel_pca_test.cpp + #main_tests/kfn_test.cpp + #main_tests/kmeans_test.cpp + #main_tests/knn_test.cpp + #main_tests/krann_test.cpp + #main_tests/linear_regression_test.cpp + #main_tests/lmnn_test.cpp + #main_tests/linear_svm_test.cpp + #main_tests/local_coordinate_coding_test.cpp + #main_tests/logistic_regression_test.cpp + #main_tests/lsh_test.cpp + #main_tests/mean_shift_test.cpp + #main_tests/nbc_test.cpp + #main_tests/nca_test.cpp + #main_tests/nmf_test.cpp + #main_tests/pca_test.cpp + #main_tests/perceptron_test.cpp + #main_tests/preprocess_binarize_test.cpp + #main_tests/preprocess_imputer_test.cpp + #main_tests/preprocess_one_hot_encode_test.cpp + #main_tests/preprocess_scale_test.cpp + #main_tests/preprocess_split_test.cpp + #main_tests/radical_test.cpp + #main_tests/random_forest_test.cpp + #main_tests/softmax_regression_test.cpp + #main_tests/sparse_coding_test.cpp + #main_tests/range_search_test.cpp + #main_tests/test_helper.hpp ) if(NOT BUILD_SHARED_LIBS) diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index f5bf6d39f69..164a0aeac6a 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -127,6 +127,7 @@ TEST_CASE("LoadCSVTest", "[LoadSaveTest]") /** * Make sure a TSV is loaded correctly to a sparse matrix. 
*/ +/* TEST_CASE("LoadSparseTSVTest", "[LoadSaveTest]") { fstream f; @@ -163,10 +164,13 @@ TEST_CASE("LoadSparseTSVTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.tsv"); } +*/ + /** * Make sure a CSV in text format is loaded correctly to a sparse matrix. */ +/* TEST_CASE("LoadSparseTXTTest", "[LoadSaveTest]") { fstream f; @@ -202,7 +206,7 @@ TEST_CASE("LoadSparseTXTTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.txt"); } - +*/ /** * Make sure a TSV is loaded correctly. */ @@ -310,6 +314,7 @@ TEST_CASE("SaveCSVTest", "[LoadSaveTest]") /** * Make sure a TSV is saved correctly for a sparse matrix */ +/* TEST_CASE("SaveSparseTSVTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1\t0\t0\t0;" @@ -341,10 +346,11 @@ TEST_CASE("SaveSparseTSVTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.tsv"); } - +*/ /** * Make sure a TXT is saved correctly for a sparse matrix */ +/* TEST_CASE("SaveSparseTXTTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1 0 0 0;" @@ -376,10 +382,11 @@ TEST_CASE("SaveSparseTXTTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.txt"); } - +*/ /** * Make sure a Sparse Matrix is saved and loaded correctly in binary format */ +/* TEST_CASE("SaveSparseBinaryTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1 0 0 0;" @@ -411,7 +418,7 @@ TEST_CASE("SaveSparseBinaryTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.bin"); } - +*/ /** * Make sure CSVs can be loaded in transposed form. 
*/ From 36ab2d3042513a73f1e668b784ae2264e8c8b22e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 25 Jul 2021 21:58:54 +0530 Subject: [PATCH 050/112] uncommmenting main.cpp from CMakeLists.txt --- src/mlpack/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 4727fe3d3ba..4f6ac581d29 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -70,7 +70,7 @@ add_executable(mlpack_test #log_test.cpp #loss_functions_test.cpp #lsh_test.cpp - #main.cpp + main.cpp #math_test.cpp #matrix_completion_test.cpp #maximal_inputs_test.cpp From 4b1d111056178f043ac34faa260b9265cfb774dc Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 25 Jul 2021 22:08:25 +0530 Subject: [PATCH 051/112] Removing all instances of boost::trim --- src/mlpack/core/data/load_csv.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index fbba9c22898..385422cdda3 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -33,7 +33,6 @@ #define MLPACK_CORE_DATA_LOAD_CSV_HPP #include -#include #include @@ -337,16 +336,12 @@ class LoadCSV { str.clear(); } - boost::trim(str); inout(row, col++) = infoSet.template MapString(std::move(str), row); }; while (std::getline(inFile, line)) { - // Remove whitespace from either side. - boost::trim(line); - // Parse the numbers from a line (ex: 1,2,3,4); if the parser finds a // number it will execute the setNum function. const bool canParse = qi::parse(line.begin(), line.end(), @@ -407,7 +402,6 @@ class LoadCSV { // All parsed values must be mapped. std::string str(iter.begin(), iter.end()); - boost::trim(str); inout(row, col) = infoSet.template MapString(std::move(str), row); ++row; @@ -415,9 +409,6 @@ class LoadCSV while (std::getline(inFile, line)) { - // Remove whitespace from either side. 
- boost::trim(line); - // Reset the row we are looking at. (Remember this is transposed.) row = 0; From 632c0131ae3e24f690e9759458fd51b208fc851e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 25 Jul 2021 22:13:06 +0530 Subject: [PATCH 052/112] chaning int to size_t in size comparison --- src/mlpack/core/data/load_csv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 385422cdda3..c0ad45e4b63 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -201,7 +201,7 @@ class LoadCSV // In this case we must pass everything we parse to the MapPolicy. std::string str(line.begin(), line.end()); - for(int i = 0; i < str.size(); i++) + for(size_t i = 0; i < str.size(); i++) { // Maybe there is a faster way to parser each element of the string // Also for now it is being considered that delimiter will always @@ -282,7 +282,7 @@ class LoadCSV // Maybe there is a faster way to parser each element of the string // Also for now it is being considered that delimiter will always // be comma(,) - for(int i = 0; i < str.size(); i++) + for(size_t i = 0; i < str.size(); i++) { if(str[i] != ',') { From a88a1ae92f58a836590743b5405ded542dbf664a Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 30 Jul 2021 05:02:50 +0530 Subject: [PATCH 053/112] Removed boost spirit. Still using boost::trim. Failing cases must be due to quoted strings --- src/mlpack/core/data/load_csv.cpp | 20 +-- src/mlpack/core/data/load_csv.hpp | 199 +++++++++++++++---------- src/mlpack/core/data/load_csv_impl.hpp | 7 +- 3 files changed, 129 insertions(+), 97 deletions(-) diff --git a/src/mlpack/core/data/load_csv.cpp b/src/mlpack/core/data/load_csv.cpp index d95843dae1f..22178b3f16f 100644 --- a/src/mlpack/core/data/load_csv.cpp +++ b/src/mlpack/core/data/load_csv.cpp @@ -10,18 +10,13 @@ * 3-clause BSD license along with mlpack. 
If not, see * http://www.opensource.org/licenses/BSD-3-Clause for more information. */ -#include "load_csv.hpp" +/*#include "load_csv.hpp" using namespace boost::spirit; namespace mlpack { namespace data { -LoadCSV::LoadCSV() -{ - // Nothing to do here. -} - LoadCSV::LoadCSV(const std::string& file) : extension(Extension(file)), filename(file), @@ -76,17 +71,6 @@ LoadCSV::LoadCSV(const std::string& file) : } } -void LoadCSV::CheckOpen() -{ - if (!inFile.is_open()) - { - std::ostringstream oss; - oss << "Cannot open file '" << filename << "'. " << std::endl; - throw std::runtime_error(oss.str()); - } - - inFile.unsetf(std::ios::skipws); -} - } // namespace data } // namespace mlpack +*/ diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index c0ad45e4b63..27e5f8748df 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -32,10 +32,8 @@ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP -#include - #include - +#include #include #include @@ -56,13 +54,36 @@ class LoadCSV { public: + char delim; // Do nothing, just a place holder, to be removed later. - LoadCSV(); + LoadCSV() + { + } + /** * Construct the LoadCSV object on the given file. This will construct the * rules necessary for loading and attempt to open the file. */ - LoadCSV(const std::string& file); + LoadCSV(const std::string& file) : + extension(Extension(file)), + filename(file), + inFile(file) + { + if(extension == "csv") + { + delim = ','; + } + else if(extension == "tsv") + { + delim = '\t'; + } + else + { + delim = ' '; + } + + CheckOpen(); + } /** * Convert the given string token to assigned datatype and assign @@ -131,8 +152,6 @@ class LoadCSV template void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper& info) { - using namespace boost::spirit; - // Take a pass through the file. If the DatasetMapper policy requires it, // we will pass everything string through MapString(). 
This might be useful // if, e.g., the MapPolicy needs to find which dimensions are numeric or @@ -186,11 +205,12 @@ class LoadCSV while (std::getline(inFile, line)) { + boost::trim(line); ++rows; if (rows == 1) { // Extract the number of columns. - std::pair dimen = GetMatSize(inFile); + std::pair dimen = GetMatSize(inFile, delim); cols = dimen.second; } @@ -200,19 +220,24 @@ class LoadCSV { // In this case we must pass everything we parse to the MapPolicy. std::string str(line.begin(), line.end()); - - for(size_t i = 0; i < str.size(); i++) + + std::stringstream line_stream; + std::string token; + + if(line.size() == 0) { - // Maybe there is a faster way to parser each element of the string - // Also for now it is being considered that delimiter will always - // be comma(,) - if(str[i] != ',') - { - std::string cc(1, str[i]); - info.template MapFirstPass(std::move(cc), rows - 1); - } - } + break; + } + + line_stream.clear(); + line_stream.str(line); + while(line_stream.good()) + { + std::getline(line_stream, token, delim); + boost::trim(token); + info.template MapFirstPass(std::move(token), rows - 1); + } } } } @@ -232,8 +257,6 @@ class LoadCSV size_t& cols, DatasetMapper& info) { - using namespace boost::spirit; - // Take a pass through the file. If the DatasetMapper policy requires it, // we will pass everything string through MapString(). This might be useful // if, e.g., the MapPolicy needs to find which dimensions are numeric or @@ -249,11 +272,13 @@ class LoadCSV while (std::getline(inFile, line)) { ++cols; + + boost::trim(line); if (cols == 1) { // Extract the number of dimensions. - std::pair dimen = GetMatSize(inFile); + std::pair dimen = GetMatSize(inFile, delim); rows = dimen.second; // Reset the DatasetInfo object, if needed. @@ -276,32 +301,43 @@ class LoadCSV { size_t dim = 0; - // In this case we must pass everything we parse to the MapPolicy. 
- std::string str(line.begin(), line.end()); + std::stringstream line_stream; + std::string token; + + if(line.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line); - // Maybe there is a faster way to parser each element of the string - // Also for now it is being considered that delimiter will always - // be comma(,) - for(size_t i = 0; i < str.size(); i++) + while(line_stream.good()) { - if(str[i] != ',') - { - std::string cc(1, str[i]); - info.template MapFirstPass(std::move(cc), dim++); - } + std::getline(line_stream, token, delim); + boost::trim(token); + info.template MapFirstPass(std::move(token), dim++); } } } } private: - using iter_type = boost::iterator_range; - /** * Check whether or not the file has successfully opened; throw an exception * if not. */ - void CheckOpen(); + void CheckOpen() + { + if (!inFile.is_open()) + { + std::ostringstream oss; + oss << "Cannot open file '" << filename << "'. " << std::endl; + throw std::runtime_error(oss.str()); + } + + inFile.unsetf(std::ios::skipws); + } /** * Parse a non-transposed matrix. @@ -313,8 +349,6 @@ class LoadCSV void NonTransposeParse(arma::Mat& inout, DatasetMapper& infoSet) { - using namespace boost::spirit; - // Get the size of the matrix. size_t rows, cols; GetMatrixSize(rows, cols, infoSet); @@ -329,24 +363,34 @@ class LoadCSV inFile.clear(); inFile.seekg(0, std::ios::beg); - auto setCharClass = [&](iter_type const &iter) + while (std::getline(inFile, line)) { - std::string str(iter.begin(), iter.end()); - if (str == "\t") + + boost::trim(line); + + const bool canParse = true; + std::stringstream line_stream; + std::string token; + + if(line.size() == 0) { - str.clear(); + break; } - inout(row, col++) = infoSet.template MapString(std::move(str), row); - }; + line_stream.clear(); + line_stream.str(line); - while (std::getline(inFile, line)) - { - // Parse the numbers from a line (ex: 1,2,3,4); if the parser finds a - // number it will execute the setNum function. 
- const bool canParse = qi::parse(line.begin(), line.end(), - stringRule[setCharClass] % delimiterRule); + while(line_stream.good()) + { + if(token == "\t") + { + token.clear(); + } + std::getline(line_stream, token, delim); + boost::trim(token); + inout(row, col++) = infoSet.template MapString(std::move(token), row); + } // Make sure we got the right number of rows. if (col != cols) { @@ -357,6 +401,10 @@ class LoadCSV throw std::runtime_error(oss.str()); } + // I am not able to understand when can we enter this case. + // I am looking into it, if anyone can give me some hint + // it might help, currently I've assigned canParse as true + // by default if (!canParse) { std::ostringstream oss; @@ -378,8 +426,6 @@ class LoadCSV template void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) { - using namespace boost::spirit; - // Get matrix size. This also initializes infoSet correctly. size_t rows, cols; GetTransposeMatrixSize(rows, cols, infoSet); @@ -394,29 +440,31 @@ class LoadCSV inFile.clear(); inFile.seekg(0, std::ios::beg); - /** - * This is the parse rule for strings. When we get a string we have to pass - * it to the DatasetMapper. - */ - auto parseString = [&](iter_type const &iter) - { - // All parsed values must be mapped. - std::string str(iter.begin(), iter.end()); - - inout(row, col) = infoSet.template MapString(std::move(str), row); - ++row; - }; - while (std::getline(inFile, line)) { + boost::trim(line); // Reset the row we are looking at. (Remember this is transposed.) row = 0; + const bool canParse = true; + std::stringstream line_stream; + std::string token; - // Now use boost::spirit to parse the characters of the line; - // parseString() will be called when a token is detected. 
- const bool canParse = qi::parse(line.begin(), line.end(), - stringRule[parseString] % delimiterRule); + if(line.size() == 0) + { + break; + } + + line_stream.clear(); + line_stream.str(line); + while(line_stream.good()) + { + std::getline(line_stream, token, delim); + boost::trim(token); + inout(row, col) = infoSet.template MapString(std::move(token), row); + row++; + } + // Make sure we got the right number of rows. if (row != rows) { @@ -425,7 +473,11 @@ class LoadCSV << ") on line " << col << "; should be " << rows << " dimensions."; throw std::runtime_error(oss.str()); } - + + // I am not able to understand when can we enter this case. + // I am looking into it, if anyone can give me some hint + // it might help, currently I've assigned canParser as true + // by default if (!canParse) { std::ostringstream oss; @@ -439,12 +491,7 @@ class LoadCSV } } - inline std::pair GetMatSize(std::ifstream& f); - - //! Spirit rule for parsing. - boost::spirit::qi::rule stringRule; - //! Spirit rule for delimiters (i.e. ',' for CSVs). - boost::spirit::qi::rule delimiterRule; + inline std::pair GetMatSize(std::ifstream& f, const char delim); //! Extension (type) of file. 
std::string extension; diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 994516f628c..6bf0b75b587 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -42,6 +42,7 @@ namespace mlpack { namespace data { + /** * Given the address of a martix element(val) * sets it equal to the provided value(token) @@ -119,7 +120,7 @@ namespace data return true; } - inline std::pair LoadCSV::GetMatSize(std::ifstream& f) + inline std::pair LoadCSV::GetMatSize(std::ifstream& f, const char delim = ',') { bool load_okay = f.good(); @@ -149,7 +150,7 @@ namespace data while (line_stream.good()) { - std::getline(line_stream, token, ','); + std::getline(line_stream, token, delim); ++line_n_cols; } @@ -182,7 +183,7 @@ namespace data std::pair mat_size = GetMatSize(f); - x.zeros(mat_size.second, mat_size.first); + x.zeros(mat_size.first, mat_size.second); size_t row = 0; From cf87cb9ad096a509dce0038c0cb2b6f636356333 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 31 Jul 2021 00:17:37 +0530 Subject: [PATCH 054/112] Removing load.cpp which contains boost::spirit code --- src/mlpack/core/data/load_csv.cpp | 76 ------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 src/mlpack/core/data/load_csv.cpp diff --git a/src/mlpack/core/data/load_csv.cpp b/src/mlpack/core/data/load_csv.cpp deleted file mode 100644 index 22178b3f16f..00000000000 --- a/src/mlpack/core/data/load_csv.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/** - * @file core/data/load_csv.cpp - * @author Tham Ngap Wei - * @author Mehul Kumar Nirala - * - * A CSV reader that uses boost::spirit. - * - * mlpack is free software; you may redistribute it and/or modify it under the - * terms of the 3-clause BSD license. You should have received a copy of the - * 3-clause BSD license along with mlpack. If not, see - * http://www.opensource.org/licenses/BSD-3-Clause for more information. 
- */ -/*#include "load_csv.hpp" - -using namespace boost::spirit; - -namespace mlpack { -namespace data { - -LoadCSV::LoadCSV(const std::string& file) : - extension(Extension(file)), - filename(file), - inFile(file) -{ - // Attempt to open stream. - CheckOpen(); - - //! Spirit rule for parsing quoted string. - boost::spirit::qi::rule quotedRule; - // Match quoted strings as: "string" or 'string' - quotedRule = qi::raw[(qi::char_("'") >> *((qi::char_ - "'") | - "'" >> qi::char_("'")) >> "'") | - (qi::char_('"') >> *((qi::char_ - '"') | - '"' >> qi::char_('"')) >> '"') ]; - - // Set rules. - if (extension == "csv") - { - // Match all characters that are not ',', '\r', or '\n'. - stringRule = quotedRule.copy() | qi::raw[*~qi::char_(",\r\n")]; - } - else if (extension == "txt") - { - // Match all characters that are not ' ', ',', '\r', or '\n'. - stringRule = quotedRule.copy() | qi::raw[*~qi::char_(" ,\r\n")]; - } - else // TSV. - { - // Match all characters that are not '\t', '\r', or '\n'. - stringRule = quotedRule.copy() | qi::raw[*~qi::char_("\t\r\n")]; - } - - if (extension == "csv") - { - // Extract a single comma as the delimiter, catching whitespace on either - // side. - delimiterRule = qi::raw[(*qi::char_(" ") >> qi::char_(",") >> - *qi::char_(" "))]; - } - else if (extension == "txt") - { - // This one is a little more difficult, we need to catch any number of - // spaces more than one. - delimiterRule = qi::raw[+qi::char_(" ")]; - } - else // TSV. - { - // Catch a tab character, possibly with whitespace on either side. 
- delimiterRule = qi::raw[(*qi::char_(" ") >> qi::char_("\t") >> - *qi::char_(" "))]; - } -} - -} // namespace data -} // namespace mlpack -*/ From 0ddd6abc8c7479dcd6d84755fb578d866934bc05 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 31 Jul 2021 00:18:40 +0530 Subject: [PATCH 055/112] trim() implementation to replace boost::trim() --- src/mlpack/core/data/string_algorithms.hpp | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/mlpack/core/data/string_algorithms.hpp diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp new file mode 100644 index 00000000000..736054e0787 --- /dev/null +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -0,0 +1,50 @@ +/** + * @file core/data/string_algorithms.hpp + * @author Gopi M. Tatiraju + * + * Utility fucntions related to string manipulation + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP +#define MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP + +namespace mlpack{ +namespace data{ + +/** + * A simple trim fucntion to strip off whitespaces + * from both the side of string. + */ +inline std::string trim(std::string str) +{ + size_t startIndex = 0; + + while(std::isspace(str[startIndex])) + { + startIndex++; + } + + size_t endIndex = str.size() - 1; + + while(std::isspace(str[endIndex])) + { + endIndex--; + } + + std::string trimmedStr = (endIndex - startIndex == str.size()) ? 
+ std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); + + return trimmedStr; +} + +} // namespace data +} // namespace mlpack + +#endif + + From ed8e614721d452003dd8cdf96e19c2829739dfef Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 31 Jul 2021 00:19:22 +0530 Subject: [PATCH 056/112] Removing load.cpp and adding string_algorithms.hpp --- src/mlpack/core/data/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index d0c029a829b..fccbd9892c5 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -10,7 +10,6 @@ set(SOURCES has_serialize.hpp is_naninf.hpp load_csv.hpp - load_csv.cpp load.hpp load_image_impl.hpp load_image.cpp @@ -25,6 +24,7 @@ set(SOURCES save_impl.hpp save_image.cpp split_data.hpp + string_algorithms.hpp imputer.hpp binarize.hpp string_encoding.hpp From 1fa7b645b63090830355d1b13f571fe2baa26ea5 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 31 Jul 2021 00:20:28 +0530 Subject: [PATCH 057/112] replace boost::trim() with mlpack::data::trim() --- src/mlpack/core/data/detect_file_type.cpp | 4 ++-- src/mlpack/core/data/load_arff_impl.hpp | 14 ++++++++------ src/mlpack/core/data/load_csv.hpp | 23 +++++++++++++---------- src/mlpack/core/data/load_impl.hpp | 4 ++-- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 9e2eee79e5d..f51bef087a0 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -14,8 +14,8 @@ */ #include "extension.hpp" #include "detect_file_type.hpp" +#include "string_algorithms.hpp" -#include #include #include @@ -214,7 +214,7 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) const std::streampos pos = stream.tellg(); std::string line; std::getline(stream, line, '\n'); - boost::trim(line); + 
trim(line); // Reset stream position. stream.seekg(pos); diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 0cb7903f808..07c3a9d3f9d 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -15,7 +15,9 @@ // In case it hasn't been included yet. #include "load_arff.hpp" +#include "string_algorithms.hpp" #include + #include "is_naninf.hpp" namespace mlpack { @@ -47,7 +49,7 @@ void LoadARFF(const std::string& filename, { // Read the next line, then strip whitespace from either side. std::getline(ifs, line, '\n'); - boost::trim(line); + trim(line); ++headerLines; // Is the first character a comment, or is the line empty? @@ -103,7 +105,7 @@ void LoadARFF(const std::string& filename, // `origDimType` string here instead (which has not had ::tolower used // on it). types.push_back(true); - boost::trim_if(origDimType, + boost::trim_if(origDimType, [](char c) { return c == '{' || c == '}' || c == ' ' || c == '\t'; @@ -117,7 +119,7 @@ void LoadARFF(const std::string& filename, while (it != dimTok.end()) { std::string category = (*it); - boost::trim(category); + trim(category); categories.push_back(category); ++it; @@ -199,7 +201,7 @@ void LoadARFF(const std::string& filename, while (ifs.good()) { std::getline(ifs, line, '\n'); - boost::trim(line); + trim(line); // Each line of the @data section must be a CSV (except sparse data, which // we will handle later). So now we can tokenize the // CSV and parse it. The '?' representing a missing value is not allowed, @@ -233,7 +235,7 @@ void LoadARFF(const std::string& filename, { // Strip spaces before mapping. std::string token = *it; - boost::trim(token); + trim(token); const size_t currentNumMappings = info.NumMappings(col); const eT result = info.template MapString(token, col); @@ -273,7 +275,7 @@ void LoadARFF(const std::string& filename, // error, otherwise we issue a general error. 
std::stringstream error; std::string tokenStr = token.str(); - boost::trim(tokenStr); + trim(tokenStr); if (tokenStr == "?") error << "Missing values ('?') not supported, "; else diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 27e5f8748df..d829c70aad1 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -33,10 +33,10 @@ #define MLPACK_CORE_DATA_LOAD_CSV_HPP #include -#include #include #include +#include "string_algorithms.hpp" #include "extension.hpp" #include "format.hpp" #include "dataset_mapper.hpp" @@ -77,8 +77,10 @@ class LoadCSV { delim = '\t'; } - else + else if(extension == "txt") { + // Can we have a case where number + // of spaces is more than 1 delim = ' '; } @@ -205,7 +207,7 @@ class LoadCSV while (std::getline(inFile, line)) { - boost::trim(line); + trim(line); ++rows; if (rows == 1) { @@ -235,7 +237,7 @@ class LoadCSV while(line_stream.good()) { std::getline(line_stream, token, delim); - boost::trim(token); + trim(token); info.template MapFirstPass(std::move(token), rows - 1); } } @@ -273,7 +275,7 @@ class LoadCSV { ++cols; - boost::trim(line); + trim(line); if (cols == 1) { @@ -315,7 +317,7 @@ class LoadCSV while(line_stream.good()) { std::getline(line_stream, token, delim); - boost::trim(token); + trim(token); info.template MapFirstPass(std::move(token), dim++); } } @@ -366,7 +368,7 @@ class LoadCSV while (std::getline(inFile, line)) { - boost::trim(line); + trim(line); const bool canParse = true; std::stringstream line_stream; @@ -388,7 +390,7 @@ class LoadCSV } std::getline(line_stream, token, delim); - boost::trim(token); + trim(token); inout(row, col++) = infoSet.template MapString(std::move(token), row); } // Make sure we got the right number of rows. @@ -442,7 +444,7 @@ class LoadCSV while (std::getline(inFile, line)) { - boost::trim(line); + trim(line); // Reset the row we are looking at. (Remember this is transposed.) 
row = 0; const bool canParse = true; @@ -460,7 +462,8 @@ class LoadCSV while(line_stream.good()) { std::getline(line_stream, token, delim); - boost::trim(token); + trim(token); + inout(row, col) = infoSet.template MapString(std::move(token), row); row++; } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 7be7abbedb1..f4cd950025b 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -23,7 +23,7 @@ #include "detect_file_type.hpp" #include "types.hpp" -#include +#include "string_algorithms.hpp" #include #include @@ -41,7 +41,7 @@ std::vector ToTokens(Tokenizer& lineTok) [&tokens](std::string const &str) { std::string trimmedToken(str); - boost::trim(trimmedToken); + trim(trimmedToken); return std::move(trimmedToken); }); From 9ac7cd18a84e5ec4b449c21e335cc1c541ae989d Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 4 Aug 2021 05:47:13 +0530 Subject: [PATCH 058/112] Handling prasing --- src/mlpack/core/data/load_csv.hpp | 144 ++++++++++++++++++++- src/mlpack/core/data/load_csv_impl.hpp | 72 +++++++++++ src/mlpack/core/data/string_algorithms.hpp | 10 +- 3 files changed, 222 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index d829c70aad1..7e4c34b7433 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -212,7 +212,7 @@ class LoadCSV if (rows == 1) { // Extract the number of columns. 
- std::pair dimen = GetMatSize(inFile, delim); + std::pair dimen = GetNonNumericMatSize(inFile, delim); cols = dimen.second; } @@ -238,6 +238,40 @@ class LoadCSV { std::getline(line_stream, token, delim); trim(token); + + /*size_t found = token.find('"'); + + if(found != std::string::npos) + { + std::string firstPart = token + ","; + std::string secondPart; + + std::getline(line_stream, secondPart, delim); + token = firstPart + secondPart; + } + */ + + if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') + ) && token[token.size() - 1] != '"') + { + /* + token += delim; + std::string part; + std::getline(line_stream, part, delim); + token += part; + */ + std::string tok = token; + + while(token[token.size() - 1] != '"') + { + tok += delim; + std::getline(line_stream, token, delim); + tok += token; + } + + token = tok; + } + info.template MapFirstPass(std::move(token), rows - 1); } } @@ -280,7 +314,7 @@ class LoadCSV if (cols == 1) { // Extract the number of dimensions. - std::pair dimen = GetMatSize(inFile, delim); + std::pair dimen = GetNonNumericMatSize(inFile, delim); rows = dimen.second; // Reset the DatasetInfo object, if needed. 
@@ -318,6 +352,41 @@ class LoadCSV { std::getline(line_stream, token, delim); trim(token); + + /*size_t found = token.find('"'); + + if(found != std::string::npos) + { + std::string firstPart = token + ","; + std::string secondPart; + + std::getline(line_stream, secondPart, delim); + token = firstPart + secondPart; + } + */ + + if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') + ) && token[token.size() - 1] != '"') + { + /* + token += delim; + std::string part; + std::getline(line_stream, part, delim); + token += part; + */ + + std::string tok = token; + + while(token[token.size() - 1] != '"') + { + tok += delim; + std::getline(line_stream, token, delim); + tok += token; + } + + token = tok; + } + info.template MapFirstPass(std::move(token), dim++); } } @@ -391,6 +460,41 @@ class LoadCSV std::getline(line_stream, token, delim); trim(token); + + /*size_t found = token.find('"'); + + if(found != std::string::npos) + { + std::string firstPart = token + ","; + std::string secondPart; + + std::getline(line_stream, secondPart, delim); + token = firstPart + secondPart; + } + */ + + if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') + ) && token[token.size() - 1] != '"') + { + /* + token += delim; + std::string part; + std::getline(line_stream, part, delim); + token += part; + */ + + std::string tok = token; + + while(token[token.size() - 1] != '"') + { + tok += delim; + std::getline(line_stream, token, delim); + tok += token; + } + + token = tok; + } + inout(row, col++) = infoSet.template MapString(std::move(token), row); } // Make sure we got the right number of rows. 
@@ -464,6 +568,41 @@ class LoadCSV std::getline(line_stream, token, delim); trim(token); + /*size_t found = token.find('"'); + + if(found != std::string::npos) + { + std::string firstPart = token + ","; + std::string secondPart; + + std::getline(line_stream, secondPart, delim); + token = firstPart + secondPart; + } + */ + + if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') + ) && token[token.size() - 1] != '"') + { + + /* + token += delim; + std::string part; + std::getline(line_stream, part, delim); + token += part; + */ + // first part of the string + std::string tok = token; + + while(token[token.size() - 1] != '"') + { + tok += delim; + std::getline(line_stream, token, delim); + tok += token; + } + + token = tok; + } + inout(row, col) = infoSet.template MapString(std::move(token), row); row++; } @@ -496,6 +635,7 @@ class LoadCSV inline std::pair GetMatSize(std::ifstream& f, const char delim); + inline std::pair GetNonNumericMatSize(std::ifstream& f, const char delim); //! Extension (type) of file. std::string extension; //! Name of file. 
diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 6bf0b75b587..0c4c9759097 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -170,6 +170,78 @@ namespace data return mat_size; } + + inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, const char delim = ',') + { + bool load_okay = f.good(); + + f.clear(); + + const std::fstream::pos_type pos1 = f.tellg(); + + size_t f_n_rows = 0; + size_t f_n_cols = 0; + + std::string line_string; + std::stringstream line_stream; + std::string token; + + while (f.good() && load_okay) + { + std::getline(f, line_string); + if (line_string.size() == 0) + { + break; + } + line_stream.clear(); + line_stream.str(line_string); + + size_t line_n_cols = 0; + + while (line_stream.good()) + { + std::getline(line_stream, token, delim); + + /*size_t found = token.find('"'); + + if(found == std::string::npos) + { + ++line_n_cols; + } + else + { + std::getline(line_stream, token, delim); + ++line_n_cols; + } + */ + + if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') + ) && token[token.size() - 1] != '"') + { + while(token[token.size() - 1] != '"') + { + std::getline(line_stream, token, delim); + } + } + + ++line_n_cols; + } + + if (f_n_cols < line_n_cols) + { + f_n_cols = line_n_cols; + } + + ++f_n_rows; + } + + f.clear(); + f.seekg(pos1); + + std::pair mat_size(f_n_rows, f_n_cols); + + return mat_size; + } /** * Returns a bool value showing whether data was loaded successfully or not. * Parses the file and loads the data into the given matrix. diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 736054e0787..74fdbdb4e56 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -20,8 +20,14 @@ namespace data{ * A simple trim fucntion to strip off whitespaces * from both the side of string. 
*/ -inline std::string trim(std::string str) +inline void trim(std::string& str) { + if(str.size() < 2) + { + str = ""; + return; + } + size_t startIndex = 0; while(std::isspace(str[startIndex])) @@ -39,7 +45,7 @@ inline std::string trim(std::string str) std::string trimmedStr = (endIndex - startIndex == str.size()) ? std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); - return trimmedStr; + str = trimmedStr; } } // namespace data From b32a9131b5ae5506f69f9bf2382128084ad8aded Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 01:36:07 +0530 Subject: [PATCH 059/112] Solving bug in trim fucntion --- src/mlpack/core/data/load_csv.hpp | 12 ++++-------- src/mlpack/core/data/load_csv_impl.hpp | 3 +-- src/mlpack/core/data/string_algorithms.hpp | 1 - 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 7e4c34b7433..d364bc92c3c 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -251,8 +251,7 @@ class LoadCSV } */ - if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') - ) && token[token.size() - 1] != '"') + if(token[0] == '"' && token[token.size() - 1] != '"') { /* token += delim; @@ -365,8 +364,7 @@ class LoadCSV } */ - if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') - ) && token[token.size() - 1] != '"') + if(token[0] == '"' && token[token.size() - 1] != '"') { /* token += delim; @@ -473,8 +471,7 @@ class LoadCSV } */ - if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') - ) && token[token.size() - 1] != '"') + if(token[0] == '"' && token[token.size() - 1] != '"') { /* token += delim; @@ -580,8 +577,7 @@ class LoadCSV } */ - if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') - ) && token[token.size() - 1] != '"') + if(token[0] == '"' && token[token.size() - 1] != '"') { /* diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 
0c4c9759097..ed85867750d 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -215,8 +215,7 @@ namespace data } */ - if((token[0] == '"' || (token[0] == '\\' && token[1] == '"') - ) && token[token.size() - 1] != '"') + if(token[0] == '"' && token[token.size() - 1] != '"') { while(token[token.size() - 1] != '"') { diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 74fdbdb4e56..40a96df0258 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -24,7 +24,6 @@ inline void trim(std::string& str) { if(str.size() < 2) { - str = ""; return; } From 0201984c7ef25a54dc291d524316816ab365f9cd Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 01:48:48 +0530 Subject: [PATCH 060/112] Handling string containg only space --- src/mlpack/core/data/string_algorithms.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 40a96df0258..2858c0035f1 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -22,11 +22,16 @@ namespace data{ */ inline void trim(std::string& str) { - if(str.size() < 2) + /*if(str.size() < 2) { return; - } + }*/ + if(str.find_first_not_of(' ') == std::string::npos) + { + str = ""; + return; + } size_t startIndex = 0; while(std::isspace(str[startIndex])) From 3b70751168d7bd41dfc270a24005d2b2a3f27991 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 06:54:08 +0530 Subject: [PATCH 061/112] CSV files with header --- src/mlpack/core/data/load_csv.hpp | 4 ++-- src/mlpack/core/data/load_csv_impl.hpp | 4 ++-- src/mlpack/core/data/load_impl.hpp | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index d364bc92c3c..00661e719b5 100644 --- 
a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -117,7 +117,7 @@ class LoadCSV * @param f File stream to access the data file */ template - bool LoadCSVFile(MatType& x, std::ifstream& f); + bool LoadCSVFile(MatType& x, std::fstream& f); /** * Load the file into the given matrix with the given DatasetMapper object. @@ -629,7 +629,7 @@ class LoadCSV } } - inline std::pair GetMatSize(std::ifstream& f, const char delim); + inline std::pair GetMatSize(std::fstream& f, const char delim); inline std::pair GetNonNumericMatSize(std::ifstream& f, const char delim); //! Extension (type) of file. diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index ed85867750d..2d264a741de 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -120,7 +120,7 @@ namespace data return true; } - inline std::pair LoadCSV::GetMatSize(std::ifstream& f, const char delim = ',') + inline std::pair LoadCSV::GetMatSize(std::fstream& f, const char delim = ',') { bool load_okay = f.good(); @@ -246,7 +246,7 @@ namespace data * Parses the file and loads the data into the given matrix. */ template - bool LoadCSV::LoadCSVFile(MatType& x, std::ifstream& f) + bool LoadCSV::LoadCSVFile(MatType& x, std::fstream& f) { bool load_okay = f.good(); diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index f4cd950025b..3dae41adf82 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -94,7 +94,6 @@ bool Load(const std::string& filename, // Catch nonexistent files by opening the stream ourselves. std::fstream stream; - std::ifstream istream(filename); #ifdef _WIN32 // Always open in binary mode on Windows. 
stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); @@ -169,7 +168,7 @@ bool Load(const std::string& filename, if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = load.LoadCSVFile(matrix, istream); + success = load.LoadCSVFile(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); } From 81ebd0ab204876fecf89aa63930a9456fecaaab9 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 09:18:28 +0530 Subject: [PATCH 062/112] Uncommenting other tests --- src/mlpack/tests/CMakeLists.txt | 342 ++++++++++++++++---------------- 1 file changed, 171 insertions(+), 171 deletions(-) diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 4f6ac581d29..33b10aa735d 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -3,180 +3,180 @@ include(CTest) # mlpack test executable. add_executable(mlpack_test EXCLUDE_FROM_ALL - #activation_functions_test.cpp - #adaboost_test.cpp - #akfn_test.cpp - #aknn_test.cpp - #ann_dist_test.cpp - #ann_layer_test.cpp - #ann_regularizer_test.cpp - #ann_test_tools.hpp - #ann_visitor_test.cpp - #armadillo_svd_test.cpp - #arma_extend_test.cpp - #async_learning_test.cpp - #augmented_rnns_tasks_test.cpp - #bayesian_linear_regression_test.cpp - #bias_svd_test.cpp - #binarize_test.cpp - #block_krylov_svd_test.cpp - #callback_test.cpp - #cf_test.cpp - #cli_binding_test.cpp - #convolutional_network_test.cpp - #convolution_test.cpp - #cosine_tree_test.cpp - #cv_test.cpp - #dbscan_test.cpp - #dcgan_test.cpp - #decision_tree_regressor_test.cpp - #decision_tree_test.cpp - #det_test.cpp - #distribution_test.cpp - #drusilla_select_test.cpp - #emst_test.cpp - #facilities_test.cpp - #fastmks_test.cpp - #feedforward_network_test.cpp - #feedforward_network_2_test.cpp - #gan_test.cpp - #gmm_test.cpp - #hmm_test.cpp - #hpt_test.cpp - #hoeffding_tree_test.cpp - #hyperplane_test.cpp - #image_load_test.cpp - 
#imputation_test.cpp - #init_rules_test.cpp - #io_test.cpp - #kde_test.cpp - #kernel_pca_test.cpp - #kernel_test.cpp - #kernel_traits_test.cpp - #kfn_test.cpp - #kmeans_test.cpp - #knn_test.cpp - #krann_search_test.cpp - #ksinit_test.cpp - #lars_test.cpp - #layer_names_test.cpp - #lin_alg_test.cpp - #linear_regression_test.cpp - #lmnn_test.cpp - #linear_svm_test.cpp + activation_functions_test.cpp + adaboost_test.cpp + akfn_test.cpp + aknn_test.cpp + ann_dist_test.cpp + ann_layer_test.cpp + ann_regularizer_test.cpp + ann_test_tools.hpp + ann_visitor_test.cpp + armadillo_svd_test.cpp + arma_extend_test.cpp + async_learning_test.cpp + augmented_rnns_tasks_test.cpp + bayesian_linear_regression_test.cpp + bias_svd_test.cpp + binarize_test.cpp + block_krylov_svd_test.cpp + callback_test.cpp + cf_test.cpp + cli_binding_test.cpp + convolutional_network_test.cpp + convolution_test.cpp + cosine_tree_test.cpp + cv_test.cpp + dbscan_test.cpp + dcgan_test.cpp + decision_tree_regressor_test.cpp + decision_tree_test.cpp + det_test.cpp + distribution_test.cpp + drusilla_select_test.cpp + emst_test.cpp + facilities_test.cpp + fastmks_test.cpp + feedforward_network_test.cpp + feedforward_network_2_test.cpp + gan_test.cpp + gmm_test.cpp + hmm_test.cpp + hpt_test.cpp + hoeffding_tree_test.cpp + hyperplane_test.cpp + image_load_test.cpp + imputation_test.cpp + init_rules_test.cpp + io_test.cpp + kde_test.cpp + kernel_pca_test.cpp + kernel_test.cpp + kernel_traits_test.cpp + kfn_test.cpp + kmeans_test.cpp + knn_test.cpp + krann_search_test.cpp + ksinit_test.cpp + lars_test.cpp + layer_names_test.cpp + lin_alg_test.cpp + linear_regression_test.cpp + lmnn_test.cpp + linear_svm_test.cpp load_save_test.cpp - #local_coordinate_coding_test.cpp - #logistic_regression_test.cpp - #log_test.cpp - #loss_functions_test.cpp - #lsh_test.cpp + local_coordinate_coding_test.cpp + logistic_regression_test.cpp + log_test.cpp + loss_functions_test.cpp + lsh_test.cpp main.cpp - #math_test.cpp - 
#matrix_completion_test.cpp - #maximal_inputs_test.cpp - #metric_test.cpp - #mean_shift_test.cpp - #mock_categorical_data.hpp - #nbc_test.cpp - #nca_test.cpp - #nmf_test.cpp - #nystroem_method_test.cpp - #octree_test.cpp - #one_hot_encoding_test.cpp - #pca_test.cpp - #perceptron_test.cpp - #prefixedoutstream_test.cpp - #python_binding_test.cpp - #qdafn_test.cpp - #quic_svd_test.cpp - #q_learning_test.cpp - #radical_test.cpp - #random_forest_test.cpp - #random_test.cpp - #randomized_svd_test.cpp - #range_search_test.cpp - #rbm_network_test.cpp - #rectangle_tree_test.cpp - #recurrent_network_test.cpp - #rnn_reber_test.cpp - #regularized_svd_test.cpp - #reward_clipping_test.cpp - #rl_components_test.cpp - #scaling_test.cpp - #size_checks_test.cpp - #serialization.cpp - #serialization.hpp - #serialization_test.cpp - #sfinae_test.cpp - #softmax_regression_test.cpp - #sort_policy_test.cpp - #sparse_autoencoder_test.cpp - #sparse_coding_test.cpp - #spill_tree_test.cpp - #split_data_test.cpp - #string_encoding_test.cpp - #sumtree_test.cpp - #svd_batch_test.cpp - #svd_incremental_test.cpp - #svdplusplus_test.cpp - #termination_policy_test.cpp + math_test.cpp + matrix_completion_test.cpp + maximal_inputs_test.cpp + metric_test.cpp + mean_shift_test.cpp + mock_categorical_data.hpp + nbc_test.cpp + nca_test.cpp + nmf_test.cpp + nystroem_method_test.cpp + octree_test.cpp + one_hot_encoding_test.cpp + pca_test.cpp + perceptron_test.cpp + prefixedoutstream_test.cpp + python_binding_test.cpp + qdafn_test.cpp + quic_svd_test.cpp + q_learning_test.cpp + radical_test.cpp + random_forest_test.cpp + random_test.cpp + randomized_svd_test.cpp + range_search_test.cpp + rbm_network_test.cpp + rectangle_tree_test.cpp + recurrent_network_test.cpp + rnn_reber_test.cpp + regularized_svd_test.cpp + reward_clipping_test.cpp + rl_components_test.cpp + scaling_test.cpp + size_checks_test.cpp + serialization.cpp + serialization.hpp + serialization_test.cpp + sfinae_test.cpp + 
softmax_regression_test.cpp + sort_policy_test.cpp + sparse_autoencoder_test.cpp + sparse_coding_test.cpp + spill_tree_test.cpp + split_data_test.cpp + string_encoding_test.cpp + sumtree_test.cpp + svd_batch_test.cpp + svd_incremental_test.cpp + svdplusplus_test.cpp + termination_policy_test.cpp test_catch_tools.hpp - #test_function_tools.hpp - #timer_test.cpp - #tree_test.cpp - #tree_traits_test.cpp - #ub_tree_test.cpp - #union_find_test.cpp - #vantage_point_tree_test.cpp - #wgan_test.cpp - #main_tests/adaboost_test.cpp - #main_tests/approx_kfn_test.cpp - #main_tests/bayesian_linear_regression_test.cpp - #main_tests/cf_test.cpp - #main_tests/dbscan_test.cpp - #main_tests/decision_tree_test.cpp - #main_tests/det_test.cpp - #main_tests/emst_test.cpp - #main_tests/fastmks_test.cpp - #main_tests/gmm_generate_test.cpp - #main_tests/gmm_probability_test.cpp - #main_tests/gmm_train_test.cpp - #main_tests/hmm_generate_test.cpp - #main_tests/hmm_loglik_test.cpp - #main_tests/hmm_test_utils.hpp - #main_tests/hmm_train_test.cpp - #main_tests/hmm_viterbi_test.cpp - #main_tests/hoeffding_tree_test.cpp - #main_tests/image_converter_test.cpp - #main_tests/kde_test.cpp - #main_tests/kernel_pca_test.cpp - #main_tests/kfn_test.cpp - #main_tests/kmeans_test.cpp - #main_tests/knn_test.cpp - #main_tests/krann_test.cpp - #main_tests/linear_regression_test.cpp - #main_tests/lmnn_test.cpp - #main_tests/linear_svm_test.cpp - #main_tests/local_coordinate_coding_test.cpp - #main_tests/logistic_regression_test.cpp - #main_tests/lsh_test.cpp - #main_tests/mean_shift_test.cpp - #main_tests/nbc_test.cpp - #main_tests/nca_test.cpp - #main_tests/nmf_test.cpp - #main_tests/pca_test.cpp - #main_tests/perceptron_test.cpp - #main_tests/preprocess_binarize_test.cpp - #main_tests/preprocess_imputer_test.cpp - #main_tests/preprocess_one_hot_encode_test.cpp - #main_tests/preprocess_scale_test.cpp - #main_tests/preprocess_split_test.cpp - #main_tests/radical_test.cpp - #main_tests/random_forest_test.cpp - 
#main_tests/softmax_regression_test.cpp - #main_tests/sparse_coding_test.cpp - #main_tests/range_search_test.cpp - #main_tests/test_helper.hpp + test_function_tools.hpp + timer_test.cpp + tree_test.cpp + tree_traits_test.cpp + ub_tree_test.cpp + union_find_test.cpp + vantage_point_tree_test.cpp + wgan_test.cpp + main_tests/adaboost_test.cpp + main_tests/approx_kfn_test.cpp + main_tests/bayesian_linear_regression_test.cpp + main_tests/cf_test.cpp + main_tests/dbscan_test.cpp + main_tests/decision_tree_test.cpp + main_tests/det_test.cpp + main_tests/emst_test.cpp + main_tests/fastmks_test.cpp + main_tests/gmm_generate_test.cpp + main_tests/gmm_probability_test.cpp + main_tests/gmm_train_test.cpp + main_tests/hmm_generate_test.cpp + main_tests/hmm_loglik_test.cpp + main_tests/hmm_test_utils.hpp + main_tests/hmm_train_test.cpp + main_tests/hmm_viterbi_test.cpp + main_tests/hoeffding_tree_test.cpp + main_tests/image_converter_test.cpp + main_tests/kde_test.cpp + main_tests/kernel_pca_test.cpp + main_tests/kfn_test.cpp + main_tests/kmeans_test.cpp + main_tests/knn_test.cpp + main_tests/krann_test.cpp + main_tests/linear_regression_test.cpp + main_tests/lmnn_test.cpp + main_tests/linear_svm_test.cpp + main_tests/local_coordinate_coding_test.cpp + main_tests/logistic_regression_test.cpp + main_tests/lsh_test.cpp + main_tests/mean_shift_test.cpp + main_tests/nbc_test.cpp + main_tests/nca_test.cpp + main_tests/nmf_test.cpp + main_tests/pca_test.cpp + main_tests/perceptron_test.cpp + main_tests/preprocess_binarize_test.cpp + main_tests/preprocess_imputer_test.cpp + main_tests/preprocess_one_hot_encode_test.cpp + main_tests/preprocess_scale_test.cpp + main_tests/preprocess_split_test.cpp + main_tests/radical_test.cpp + main_tests/random_forest_test.cpp + main_tests/softmax_regression_test.cpp + main_tests/sparse_coding_test.cpp + main_tests/range_search_test.cpp + main_tests/test_helper.hpp ) if(NOT BUILD_SHARED_LIBS) From e117294444a21929c87b5cfde552ad72d798fcbd Mon Sep 17 
00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 09:18:57 +0530 Subject: [PATCH 063/112] Implemted trim_if() --- src/mlpack/core/data/load_arff_impl.hpp | 2 +- src/mlpack/core/data/string_algorithms.hpp | 53 +++++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 07c3a9d3f9d..979a27a6d0c 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -105,7 +105,7 @@ void LoadARFF(const std::string& filename, // `origDimType` string here instead (which has not had ::tolower used // on it). types.push_back(true); - boost::trim_if(origDimType, + trim_if(origDimType, [](char c) { return c == '{' || c == '}' || c == ' ' || c == '\t'; diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 2858c0035f1..64b6d92851d 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -18,7 +18,10 @@ namespace data{ /** * A simple trim fucntion to strip off whitespaces - * from both the side of string. + * from both the side of string. If input is a string + * with all spaces then str will be empty string. + * + * @param str string to be trimmed */ inline void trim(std::string& str) { @@ -51,7 +54,53 @@ inline void trim(std::string& str) str = trimmedStr; } - + +/** + * Trim off characters from start and end of + * of the string. The supplied fucntion is + * used to determine which characters will + * be trimmed off. 
+ * + * @param str string to be trimmed + * @param func fucntion to determine the characters which should be trimmed + */ +inline void trim_if(std::string &str, bool (*func)(char)) +{ + if(str.find_first_not_of(' ') == std::string::npos) + { + str = ""; + return; + } + + size_t startIndex = 0; + + for(size_t i = 0; i < str.size(); i++) + { + bool match = func(str[i]); + + if(match) + startIndex++; + else + break; + } + + size_t endIndex = str.size() - 1; + + for(int i = str.size() - 1; i >= 0; i--) + { + bool match = func(str[i]); + if(match) + endIndex--; + else + break; + } + + std::string trimmedStr = (endIndex - startIndex == str.size()) ? + std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); + + str = trimmedStr; +} + } // namespace data } // namespace mlpack From 77b5f6b455ed716ca6e7fa21a8713e9519c1f268 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 5 Aug 2021 09:23:52 +0530 Subject: [PATCH 064/112] Somehow deleted this line, adding it back --- src/mlpack/tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 33b10aa735d..12ccdbc78bf 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -129,6 +129,7 @@ add_executable(mlpack_test union_find_test.cpp vantage_point_tree_test.cpp wgan_test.cpp + xgboost_test.cpp main_tests/adaboost_test.cpp main_tests/approx_kfn_test.cpp main_tests/bayesian_linear_regression_test.cpp From 1bbae56398342f102e37e9cd2043b599e5db461e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 6 Aug 2021 16:58:12 +0530 Subject: [PATCH 065/112] Removing comments --- src/mlpack/core/data/detect_file_type.cpp | 3 --- src/mlpack/core/data/load_arff_impl.hpp | 3 --- src/mlpack/core/data/string_algorithms.hpp | 5 ----- src/mlpack/core/data/types.hpp | 2 +- 4 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 
f51bef087a0..16076a11596 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -16,9 +16,6 @@ #include "detect_file_type.hpp" #include "string_algorithms.hpp" -#include -#include - namespace mlpack { namespace data { diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 979a27a6d0c..f59a56f3b1a 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -14,10 +14,7 @@ // In case it hasn't been included yet. #include "load_arff.hpp" - #include "string_algorithms.hpp" -#include - #include "is_naninf.hpp" namespace mlpack { diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 64b6d92851d..3d762845ce1 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -25,11 +25,6 @@ namespace data{ */ inline void trim(std::string& str) { - /*if(str.size() < 2) - { - return; - }*/ - if(str.find_first_not_of(' ') == std::string::npos) { str = ""; diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index a54e36a076d..9c145d2fe46 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -39,7 +39,7 @@ enum struct file_type PGMBinary, //!< Portable Grey Map (greyscale image) PPMBinary, //!< Portable Pixel Map (colour image), used by the field and cube classes HDF5Binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data - CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) + CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) }; /** From 7aca7447e96b25910fe632110314d697027e290e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 6 Aug 2021 18:37:49 +0530 Subject: [PATCH 066/112] Handling style checks --- src/mlpack/core/data/load_csv.hpp | 181 ++++++------------------- 
src/mlpack/core/data/load_csv_impl.hpp | 25 +--- src/mlpack/core/data/load_impl.hpp | 6 +- 3 files changed, 47 insertions(+), 165 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 00661e719b5..f2034bc8ea0 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -46,20 +46,13 @@ namespace mlpack { namespace data { /** - * Load the csv file.This class use boost::spirit - * to implement the parser, please refer to following link - * http://theboostcpplibraries.com/boost.spirit for quick review. + * Load the csv file.This class contains fucntions + * to load numeric and categorical data. */ class LoadCSV { public: - char delim; - // Do nothing, just a place holder, to be removed later. - LoadCSV() - { - } - /** * Construct the LoadCSV object on the given file. This will construct the * rules necessary for loading and attempt to open the file. @@ -79,8 +72,6 @@ class LoadCSV } else if(extension == "txt") { - // Can we have a case where number - // of spaces is more than 1 delim = ' '; } @@ -204,11 +195,12 @@ class LoadCSV inFile.clear(); inFile.seekg(0, std::ios::beg); rows = 0; - + while (std::getline(inFile, line)) { - trim(line); ++rows; + // Remove whitespaces from either side + trim(line); if (rows == 1) { // Extract the number of columns. @@ -221,8 +213,8 @@ class LoadCSV if (MapPolicy::NeedsFirstPass) { // In this case we must pass everything we parse to the MapPolicy. 
- std::string str(line.begin(), line.end()); - + std::string str(line.begin(), line.end()); + std::stringstream line_stream; std::string token; @@ -231,48 +223,31 @@ class LoadCSV break; } - line_stream.clear(); - line_stream.str(line); - - while(line_stream.good()) - { - std::getline(line_stream, token, delim); - trim(token); - - /*size_t found = token.find('"'); + line_stream.clear(); + line_stream.str(line); - if(found != std::string::npos) - { - std::string firstPart = token + ","; - std::string secondPart; + while(line_stream.good()) + { + std::getline(line_stream, token, delim); + // Remove whitespace from either side + trim(token); - std::getline(line_stream, secondPart, delim); - token = firstPart + secondPart; - } - */ + if(token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; - if(token[0] == '"' && token[token.size() - 1] != '"') - { - /* - token += delim; - std::string part; - std::getline(line_stream, part, delim); - token += part; - */ - std::string tok = token; + while(token[token.size() - 1] != '"') + { + tok += delim; + std::getline(line_stream, token, delim); + tok += token; + } - while(token[token.size() - 1] != '"') - { - tok += delim; - std::getline(line_stream, token, delim); - tok += token; + token = tok; } - token = tok; - } - - info.template MapFirstPass(std::move(token), rows - 1); - } + info.template MapFirstPass(std::move(token), rows - 1); + } } } } @@ -307,7 +282,7 @@ class LoadCSV while (std::getline(inFile, line)) { ++cols; - + // Remove whitespaces from either side trim(line); if (cols == 1) @@ -334,6 +309,7 @@ class LoadCSV // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { + // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; std::stringstream line_stream; @@ -350,29 +326,11 @@ class LoadCSV while(line_stream.good()) { std::getline(line_stream, token, delim); - trim(token); - - /*size_t found = token.find('"'); - - if(found != std::string::npos) - { - std::string firstPart = token + ","; - std::string secondPart; - - std::getline(line_stream, secondPart, delim); - token = firstPart + secondPart; - } - */ + // Remove whitespace from either side + trim(token); if(token[0] == '"' && token[token.size() - 1] != '"') { - /* - token += delim; - std::string part; - std::getline(line_stream, part, delim); - token += part; - */ - std::string tok = token; while(token[token.size() - 1] != '"') @@ -434,10 +392,9 @@ class LoadCSV while (std::getline(inFile, line)) { - + // Remove whitespaces from either side trim(line); - const bool canParse = true; std::stringstream line_stream; std::string token; @@ -451,35 +408,17 @@ class LoadCSV while(line_stream.good()) { - if(token == "\t") - { - token.clear(); - } + if(token == "\t") + { + token.clear(); + } std::getline(line_stream, token, delim); + // Remove whitespace from either side trim(token); - /*size_t found = token.find('"'); - - if(found != std::string::npos) - { - std::string firstPart = token + ","; - std::string secondPart; - - std::getline(line_stream, secondPart, delim); - token = firstPart + secondPart; - } - */ - if(token[0] == '"' && token[token.size() - 1] != '"') { - /* - token += delim; - std::string part; - std::getline(line_stream, part, delim); - token += part; - */ - std::string tok = token; while(token[token.size() - 1] != '"') @@ -504,18 +443,6 @@ class LoadCSV throw std::runtime_error(oss.str()); } - // I am not able to understand when can we enter this case. 
- // I am looking into it, if anyone can give me some hint - // it might help, currently I've assigned canParse as true - // by default - if (!canParse) - { - std::ostringstream oss; - oss << "LoadCSV::NonTransposeParse(): parsing error on line " << col - << "!"; - throw std::runtime_error(oss.str()); - } - ++row; col = 0; } } @@ -545,10 +472,10 @@ class LoadCSV while (std::getline(inFile, line)) { + // Remove whitespaces from either side trim(line); // Reset the row we are looking at. (Remember this is transposed.) row = 0; - const bool canParse = true; std::stringstream line_stream; std::string token; @@ -563,29 +490,11 @@ class LoadCSV while(line_stream.good()) { std::getline(line_stream, token, delim); + // Remove whitespaces from either side trim(token); - /*size_t found = token.find('"'); - - if(found != std::string::npos) - { - std::string firstPart = token + ","; - std::string secondPart; - - std::getline(line_stream, secondPart, delim); - token = firstPart + secondPart; - } - */ - if(token[0] == '"' && token[token.size() - 1] != '"') { - - /* - token += delim; - std::string part; - std::getline(line_stream, part, delim); - token += part; - */ // first part of the string std::string tok = token; @@ -594,7 +503,7 @@ class LoadCSV tok += delim; std::getline(line_stream, token, delim); tok += token; - } + } token = tok; } @@ -612,18 +521,6 @@ class LoadCSV throw std::runtime_error(oss.str()); } - // I am not able to understand when can we enter this case. - // I am looking into it, if anyone can give me some hint - // it might help, currently I've assigned canParser as true - // by default - if (!canParse) - { - std::ostringstream oss; - oss << "LoadCSV::TransposeParse(): parsing error on line " << col - << "!"; - throw std::runtime_error(oss.str()); - } - // Increment the column index. 
++col; } diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 2d264a741de..27b7627465a 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -46,7 +46,7 @@ namespace data /** * Given the address of a martix element(val) * sets it equal to the provided value(token) - * example calling: convert_token(x.at(row, col), token) + * example calling: convert_token(x.at(row, col), token) */ template bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) @@ -77,7 +77,7 @@ namespace data ((sig_c == 'f') || (sig_c == 'F'))) { val = neg ? -(std::numeric_limits::infinity()) : - std::numeric_limits::infinity(); + std::numeric_limits::infinity(); return true; } else if (((sig_a == 'n') || (sig_a == 'N')) && @@ -202,25 +202,12 @@ namespace data { std::getline(line_stream, token, delim); - /*size_t found = token.find('"'); - - if(found == std::string::npos) - { - ++line_n_cols; - } - else - { - std::getline(line_stream, token, delim); - ++line_n_cols; - } - */ - if(token[0] == '"' && token[token.size() - 1] != '"') { - while(token[token.size() - 1] != '"') - { - std::getline(line_stream, token, delim); - } + while(token[token.size() - 1] != '"') + { + std::getline(line_stream, token, delim); + } } ++line_n_cols; diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 3dae41adf82..6a8eef65881 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -24,8 +24,6 @@ #include "types.hpp" #include "string_algorithms.hpp" -#include -#include namespace mlpack { namespace data { @@ -163,12 +161,12 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. 
bool success; - LoadCSV load; + LoadCSV loader; if (loadType != file_type::HDF5Binary) { if(loadType == file_type::CSVASCII) - success = load.LoadCSVFile(matrix, stream); + success = loader.LoadCSVFile(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); } From 2a99165d27fb38a7ef97114f0b373d3d819d21f6 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 6 Aug 2021 18:48:35 +0530 Subject: [PATCH 067/112] Replacing old C style callback with std::fucntion --- src/mlpack/core/data/string_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 3d762845ce1..fcf6dd726e7 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -59,7 +59,7 @@ inline void trim(std::string& str) * @param str string to be trimmed * @param func fucntion to determine the characters which should be trimmed */ -inline void trim_if(std::string &str, bool (*func)(char)) +inline void trim_if(std::string &str, std::fucntion func) { if(str.find_first_not_of(' ') == std::string::npos) { From 727a02b918851e655c5519df90fa279e841dcaf1 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 6 Aug 2021 19:20:57 +0530 Subject: [PATCH 068/112] Adding back empty constructor | Syntax error in trim_if --- src/mlpack/core/data/load_csv.hpp | 6 ++++++ src/mlpack/core/data/string_algorithms.hpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index f2034bc8ea0..e7f5d14970c 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -53,6 +53,10 @@ class LoadCSV { public: + LoadCSV() + { + // To initialize the class object. + } /** * Construct the LoadCSV object on the given file. This will construct the * rules necessary for loading and attempt to open the file. 
@@ -535,6 +539,8 @@ class LoadCSV std::string filename; //! Opened stream for reading. std::ifstream inFile; + //! Delimiter char + char delim; }; } // namespace data diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index fcf6dd726e7..389371cdbe3 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -59,7 +59,7 @@ inline void trim(std::string& str) * @param str string to be trimmed * @param func fucntion to determine the characters which should be trimmed */ -inline void trim_if(std::string &str, std::fucntion func) +inline void trim_if(std::string &str, std::function func) { if(str.find_first_not_of(' ') == std::string::npos) { From 62383c0379303693e3b4a5548b1bfa2082630819 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 6 Aug 2021 20:42:03 +0530 Subject: [PATCH 069/112] Adding comment inside constructor. --- src/mlpack/core/data/load_csv.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index e7f5d14970c..5286c5b1f55 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -55,8 +55,10 @@ class LoadCSV LoadCSV() { + // Nothing to do here. // To initialize the class object. } + /** * Construct the LoadCSV object on the given file. This will construct the * rules necessary for loading and attempt to open the file. 
From 3b5a35ad258a26af6e4348bee1ba86ecc0078ecb Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 12 Aug 2021 07:48:24 +0530 Subject: [PATCH 070/112] Solving indentation issues | changed file_type -> FileType, line_stream -> lineStream, line_string -> lineString | Combining loading logic for dense mat and sparse mat --- src/mlpack/core/data/CMakeLists.txt | 1 + src/mlpack/core/data/detect_file_type.cpp | 72 ++--- src/mlpack/core/data/detect_file_type.hpp | 9 +- src/mlpack/core/data/load.hpp | 14 +- src/mlpack/core/data/load_arff_impl.hpp | 4 +- src/mlpack/core/data/load_csv.hpp | 355 ++++++++++----------- src/mlpack/core/data/load_csv_impl.hpp | 343 ++++++++++---------- src/mlpack/core/data/load_impl.hpp | 36 ++- src/mlpack/core/data/save.hpp | 2 +- src/mlpack/core/data/save_impl.hpp | 24 +- src/mlpack/core/data/string_algorithms.hpp | 28 +- src/mlpack/core/data/types.hpp | 11 +- src/mlpack/core/data/types_impl.hpp | 30 +- 13 files changed, 457 insertions(+), 472 deletions(-) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index fccbd9892c5..b652189abfd 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -10,6 +10,7 @@ set(SOURCES has_serialize.hpp is_naninf.hpp load_csv.hpp + load_csv_impl.hpp load.hpp load_image_impl.hpp load_image.cpp diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 16076a11596..583112cc0a9 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -24,17 +24,17 @@ namespace data { * * @param type Type to get the logical name of. 
*/ -std::string GetStringType(const file_type& type) +std::string GetStringType(const FileType& type) { - switch (type) + switch (type) { - case file_type::CSVASCII: return "CSV data"; - case file_type::RawASCII: return "raw ASCII formatted data"; - case file_type::RawBinary: return "raw binary formatted data"; - case file_type::ArmaASCII: return "Armadillo ASCII formatted data"; - case file_type::ArmaBinary: return "Armadillo binary formatted data"; - case file_type::PGMBinary: return "PGM data"; - case file_type::HDF5Binary: return "HDF5 data"; + case FileType::CSVASCII: return "CSV data"; + case FileType::RawASCII: return "raw ASCII formatted data"; + case FileType::RawBinary: return "raw binary formatted data"; + case FileType::ArmaASCII: return "Armadillo ASCII formatted data"; + case FileType::ArmaBinary: return "Armadillo binary formatted data"; + case FileType::PGMBinary: return "PGM data"; + case FileType::HDF5Binary: return "HDF5 data"; default: return ""; } } @@ -50,7 +50,7 @@ std::string GetStringType(const file_type& type) * * @param f Opened istream to look into to guess the file type. */ -file_type GuessFileType(std::istream& f) +FileType GuessFileType(std::istream& f) { f.clear(); const std::fstream::pos_type pos1 = f.tellg(); @@ -71,7 +71,7 @@ file_type GuessFileType(std::istream& f) // Handle empty files. 
if (nMax == 0) - return file_type::FileTypeUnknown; + return FileType::FileTypeUnknown; const arma::uword nUse = std::min(nMax, arma::uword(4096)); @@ -89,7 +89,7 @@ file_type GuessFileType(std::istream& f) if (!loadOkay) { delete[] dataMem; - return file_type::FileTypeUnknown; + return FileType::FileTypeUnknown; } bool hasBinary = false; @@ -165,12 +165,12 @@ file_type GuessFileType(std::istream& f) delete[] dataMem; if (hasBinary) - return file_type::RawBinary; + return FileType::RawBinary; if (hasComma && (hasBracket == false)) - return file_type::CSVASCII; + return FileType::CSVASCII; - return file_type::RawASCII; + return FileType::RawASCII; } /** @@ -186,22 +186,22 @@ file_type GuessFileType(std::istream& f) * @param filename Name of the file. * @return The detected file type. */ -file_type AutoDetect(std::fstream& stream, const std::string& filename) +FileType AutoDetect(std::fstream& stream, const std::string& filename) { // Get the extension. std::string extension = Extension(filename); - file_type detectedLoadType = file_type::FileTypeUnknown; + FileType detectedLoadType = FileType::FileTypeUnknown; if (extension == "csv" || extension == "tsv") { detectedLoadType = GuessFileType(stream); - if (detectedLoadType == file_type::CSVASCII) + if (detectedLoadType == FileType::CSVASCII) { if (extension == "tsv") Log::Warn << "'" << filename << "' is comma-separated, not " "tab-separated!" << std::endl; } - else if (detectedLoadType == file_type::RawASCII) // .csv file can be tsv. + else if (detectedLoadType == FileType::RawASCII) // .csv file can be tsv. 
{ if (extension == "csv") { @@ -228,7 +228,7 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) } else { - detectedLoadType = file_type::FileTypeUnknown; + detectedLoadType = FileType::FileTypeUnknown; } } else if (extension == "txt") @@ -248,15 +248,15 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_TXT) { - detectedLoadType = file_type::ArmaASCII; + detectedLoadType = FileType::ArmaASCII; } else // It's not arma_ascii. Now we let Armadillo guess. { detectedLoadType = GuessFileType(stream); - if (detectedLoadType != file_type::RawASCII && - detectedLoadType != file_type::CSVASCII) - detectedLoadType = file_type::FileTypeUnknown; + if (detectedLoadType != FileType::RawASCII && + detectedLoadType != FileType::CSVASCII) + detectedLoadType = FileType::FileTypeUnknown; } } else if (extension == "bin") @@ -274,25 +274,25 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_BIN) { - detectedLoadType = file_type::ArmaBinary; + detectedLoadType = FileType::ArmaBinary; } else // We can only assume it's raw binary. { - detectedLoadType = file_type::RawBinary; + detectedLoadType = FileType::RawBinary; } } else if (extension == "pgm") { - detectedLoadType = file_type::PGMBinary; + detectedLoadType = FileType::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - detectedLoadType = file_type::HDF5Binary; + detectedLoadType = FileType::HDF5Binary; } else // Unknown extension... { - detectedLoadType = file_type::FileTypeUnknown; + detectedLoadType = FileType::FileTypeUnknown; } return detectedLoadType; @@ -304,34 +304,34 @@ file_type AutoDetect(std::fstream& stream, const std::string& filename) * @param filename Name of the file whose type we should detect. * @return Detected type of file. 
*/ -file_type DetectFromExtension(const std::string& filename) +FileType DetectFromExtension(const std::string& filename) { const std::string extension = Extension(filename); if (extension == "csv") { - return file_type::CSVASCII; + return FileType::CSVASCII; } else if (extension == "txt") { - return file_type::RawASCII; + return FileType::RawASCII; } else if (extension == "bin") { - return file_type::ArmaBinary; + return FileType::ArmaBinary; } else if (extension == "pgm") { - return file_type::PGMBinary; + return FileType::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - return file_type::HDF5Binary; + return FileType::HDF5Binary; } else { - return file_type::FileTypeUnknown; + return FileType::FileTypeUnknown; } } diff --git a/src/mlpack/core/data/detect_file_type.hpp b/src/mlpack/core/data/detect_file_type.hpp index 9394748a292..9c318bfb34f 100644 --- a/src/mlpack/core/data/detect_file_type.hpp +++ b/src/mlpack/core/data/detect_file_type.hpp @@ -25,7 +25,8 @@ namespace data { * * @param type Type to get the logical name of. */ -std::string GetStringType(const file_type& type); +std::string GetStringType(const FileType& type); + /** * Given an istream, attempt to guess the file type. This is taken originally * from Armadillo's function guess_file_type_internal(), but we avoid using @@ -37,7 +38,7 @@ std::string GetStringType(const file_type& type); * * @param f Opened istream to look into to guess the file type. */ -file_type GuessFileType(std::istream& f); +FileType GuessFileType(std::istream& f); /** * Attempt to auto-detect the type of a file given its extension, and by @@ -52,7 +53,7 @@ file_type GuessFileType(std::istream& f); * @param filename Name of the file. * @return The detected file type. arma::file_type_unknown if unknown. 
*/ -file_type AutoDetect(std::fstream& stream, +FileType AutoDetect(std::fstream& stream, const std::string& filename); /** @@ -61,7 +62,7 @@ file_type AutoDetect(std::fstream& stream, * @param filename Name of the file whose type we should detect. * @return Detected type of file. arma::file_type_unknown if unknown. */ -file_type DetectFromExtension(const std::string& filename); +FileType DetectFromExtension(const std::string& filename); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 79ac6866fe5..d07a3ea69e2 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -71,10 +71,10 @@ namespace data /** Functions to load and save matrices and models. */ { */ template bool Load(const std::string& filename, - MatType& matrix, - const bool fatal = false, - const bool transpose = true, - const file_type inputLoadType = file_type::AutoDetect); + MatType& matrix, + const bool fatal = false, + const bool transpose = true, + const FileType inputLoadType = FileType::AutoDetect); /** * Loads a sparse matrix from file, using arma::coord_ascii format. This @@ -105,11 +105,13 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading (default true). * @return Boolean value indicating success or failure of load. */ +/* template -bool Load(const std::string& filename, - arma::sp_mat& matrix, +bool LoadSparseMatrix(const std::string& filename, + MatType matrix, const bool fatal = false, const bool transpose = true); +*/ /** * Load a column vector from a file, guessing the filetype from the extension. 
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index f59a56f3b1a..280eb16cb19 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -102,10 +102,10 @@ void LoadARFF(const std::string& filename, // `origDimType` string here instead (which has not had ::tolower used // on it). types.push_back(true); - trim_if(origDimType, + trim_if(origDimType, [](char c) { - return c == '{' || c == '}' || c == ' ' || c == '\t'; + return c == '{' || c == '}' || c == ' ' || c == '\t'; }); boost::escaped_list_separator sep("\\", ",", "\"'"); diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 5286c5b1f55..fdd946f1237 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -4,11 +4,21 @@ * @author Conrad Sanderson * @author Gopi M. Tatiraju * - * This csv parser is designed by taking reference from armadillo's csv parser. - * In this mlpack's version, all the arma dependencies were removed or replaced - * accordingly, making the parser totally independent of armadillo. + * This csv parser is designed by taking reference from + * armadillo's csv parser. In this mlpack's version, all + * the arma dependencies were removed or replaced + * accordingly, making the parser totally independent of + * armadillo. + * + * As the implementation is inspired from Armadillo it + * is necessary to add two different licenses. One for + * Armadillo and other for mlpack. * * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * + * The original Armadillo parser is licensed under the + * BSD-compatible Apache license, shown below: + * * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) * Copyright 2008-2016 National ICT Australia (NICTA) * @@ -60,23 +70,23 @@ class LoadCSV } /** - * Construct the LoadCSV object on the given file. 
This will construct the - * rules necessary for loading and attempt to open the file. - */ + * Construct the LoadCSV object on the given file. This will construct the + * rules necessary for loading and attempt to open the file. + */ LoadCSV(const std::string& file) : extension(Extension(file)), filename(file), inFile(file) { - if(extension == "csv") + if (extension == "csv") { delim = ','; } - else if(extension == "tsv") + else if (extension == "tsv") { delim = '\t'; } - else if(extension == "txt") + else if (extension == "txt") { delim = ' '; } @@ -85,48 +95,48 @@ class LoadCSV } /** - * Convert the given string token to assigned datatype and assign - * this value to the given address. The address here will be a - * matrix location. - * - * Token is always read as a string, if the given token is +/-INF or NAN - * it converts them to infinity and NAN using numeric_limits. - * - * @param val Token's value will be assigned to this address - * @param token Value which should be assigned - */ + * Convert the given string token to assigned datatype and assign + * this value to the given address. The address here will be a + * matrix location. + * + * Token is always read as a string, if the given token is +/-INF or NAN + * it converts them to infinity and NAN using numeric_limits. + * + * @param val Token's value will be assigned to this address + * @param token Value which should be assigned + */ template bool ConvertToken(typename MatType::elem_type& val, const std::string& token); /** - * Returns a bool value showing whether data was loaded successfully or not. - * - * Parses a csv file and loads the data into a given matrix. In the first pass, - * the function will determine the number of cols and rows in the given file. - * Once the rows and cols are fixed we initialize the matrix with zeros. In - * the second pass, the function converts each value to required datatype - * and sets it equal to val. 
- * - * This function uses MatType as template parameter in order to provide - * support for any type of matrices from any linear algebra library. - * - * @param x Matrix in which data will be loaded - * @param f File stream to access the data file - */ + * Returns a bool value showing whether data was loaded successfully or not. + * + * Parses a csv file and loads the data into a given matrix. In the first pass, + * the function will determine the number of cols and rows in the given file. + * Once the rows and cols are fixed we initialize the matrix with zeros. In + * the second pass, the function converts each value to required datatype + * and sets it equal to val. + * + * This function uses MatType as template parameter in order to provide + * support for any type of matrices from any linear algebra library. + * + * @param x Matrix in which data will be loaded + * @param f File stream to access the data file + */ template bool LoadCSVFile(MatType& x, std::fstream& f); /** - * Load the file into the given matrix with the given DatasetMapper object. - * Throws exceptions on errors. - * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper to use while loading. - * @param transpose If true, the matrix should be transposed on loading - * (default). - */ - template - void Load(arma::Mat &inout, + * Load the file into the given matrix with the given DatasetMapper object. + * Throws exceptions on errors. + * + * @param inout Matrix to load into. + * @param infoSet DatasetMapper to use while loading. + * @param transpose If true, the matrix should be transposed on loading + * (default). 
+ */ + template + void Load(MatType &inout, DatasetMapper &infoSet, const bool transpose = true) { @@ -135,19 +145,19 @@ class LoadCSV if (transpose) TransposeParse(inout, infoSet); else - NonTransposeParse(inout, infoSet); + NonTransposeParse(inout, infoSet); } /** - * Peek at the file to determine the number of rows and columns in the matrix, - * assuming a non-transposed matrix. This will also take a first pass over - * the data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info - * object will be re-initialized with the correct dimensionality. - * - * @param rows Variable to be filled with the number of rows. - * @param cols Variable to be filled with the number of columns. - * @param info DatasetMapper object to use for first pass. - */ + * Peek at the file to determine the number of rows and columns in the matrix, + * assuming a non-transposed matrix. This will also take a first pass over + * the data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info + * object will be re-initialized with the correct dimensionality. + * + * @param rows Variable to be filled with the number of rows. + * @param cols Variable to be filled with the number of columns. + * @param info DatasetMapper object to use for first pass. + */ template void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper& info) { @@ -166,7 +176,7 @@ class LoadCSV std::string line; while (std::getline(inFile, line)) { - ++rows; + ++rows; } // Reset the DatasetInfo object, if needed. 
@@ -178,8 +188,8 @@ class LoadCSV { std::ostringstream oss; oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; + << info.Dimensionality() << ", but data has dimensionality " + << rows; throw std::invalid_argument(oss.str()); } @@ -192,8 +202,8 @@ class LoadCSV { std::ostringstream oss; oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; + << info.Dimensionality() << ", but data has dimensionality " + << rows; throw std::invalid_argument(oss.str()); } @@ -221,57 +231,52 @@ class LoadCSV // In this case we must pass everything we parse to the MapPolicy. std::string str(line.begin(), line.end()); - std::stringstream line_stream; + std::stringstream lineStream; std::string token; - if(line.size() == 0) - { - break; - } - - line_stream.clear(); - line_stream.str(line); + lineStream.clear(); + lineStream.str(line); - while(line_stream.good()) + while (lineStream.good()) { - std::getline(line_stream, token, delim); + std::getline(lineStream, token, delim); // Remove whitespace from either side - trim(token); + trim(token); - if(token[0] == '"' && token[token.size() - 1] != '"') + if (token[0] == '"' && token[token.size() - 1] != '"') { std::string tok = token; - while(token[token.size() - 1] != '"') - { - tok += delim; - std::getline(line_stream, token, delim); - tok += token; - } + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } - token = tok; - } + token = tok; + } - info.template MapFirstPass(std::move(token), rows - 1); + info.template MapFirstPass(std::move(token), rows - 1); } } } - } + } /** - * Peek at the file to determine the number of rows and columns in the matrix, - * assuming a transposed matrix. This will also take a first pass over the - * data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. 
The info - * object will be re-initialized with the correct dimensionality. - * - * @param rows Variable to be filled with the number of rows. - * @param cols Variable to be filled with the number of columns. - * @param info DatasetMapper object to use for first pass. - */ + * Peek at the file to determine the number of rows and columns in the matrix, + * assuming a transposed matrix. This will also take a first pass over the + * data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info + * object will be re-initialized with the correct dimensionality. + * + * @param rows Variable to be filled with the number of rows. + * @param cols Variable to be filled with the number of columns. + * @param info DatasetMapper object to use for first pass. + */ template void GetTransposeMatrixSize(size_t& rows, - size_t& cols, - DatasetMapper& info) + size_t& cols, + DatasetMapper& info) { // Take a pass through the file. If the DatasetMapper policy requires it, // we will pass everything string through MapString(). This might be useful @@ -306,8 +311,8 @@ class LoadCSV { std::ostringstream oss; oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; + << info.Dimensionality() << ", but data has dimensionality " + << rows; throw std::invalid_argument(oss.str()); } } @@ -315,38 +320,33 @@ class LoadCSV // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { - // In this case we must pass everything we parse to the MapPolicy. + // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; - std::stringstream line_stream; - std::string token; - - if(line.size() == 0) - { - break; - } + std::stringstream lineStream; + std::string token; - line_stream.clear(); - line_stream.str(line); + lineStream.clear(); + lineStream.str(line); - while(line_stream.good()) + while (lineStream.good()) { - std::getline(line_stream, token, delim); - // Remove whitespace from either side + std::getline(lineStream, token, delim); + // Remove whitespace from either side trim(token); - - if(token[0] == '"' && token[token.size() - 1] != '"') + + if (token[0] == '"' && token[token.size() - 1] != '"') { - std::string tok = token; + std::string tok = token; - while(token[token.size() - 1] != '"') - { - tok += delim; - std::getline(line_stream, token, delim); - tok += token; + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; } - token = tok; + token = tok; } info.template MapFirstPass(std::move(token), dim++); @@ -356,10 +356,11 @@ class LoadCSV } private: + /** - * Check whether or not the file has successfully opened; throw an exception - * if not. - */ + * Check whether or not the file has successfully opened; throw an exception + * if not. + */ void CheckOpen() { if (!inFile.is_open()) @@ -373,14 +374,14 @@ class LoadCSV } /** - * Parse a non-transposed matrix. - * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper object to load with. - */ + * Parse a non-transposed matrix. + * + * @param inout Matrix to load into. + * @param infoSet DatasetMapper object to load with. + */ template void NonTransposeParse(arma::Mat& inout, - DatasetMapper& infoSet) + DatasetMapper& infoSet) { // Get the size of the matrix. 
size_t rows, cols; @@ -401,51 +402,45 @@ class LoadCSV // Remove whitespaces from either side trim(line); - std::stringstream line_stream; + std::stringstream lineStream; std::string token; - if(line.size() == 0) - { - break; - } - - line_stream.clear(); - line_stream.str(line); + lineStream.clear(); + lineStream.str(line); - while(line_stream.good()) + while (lineStream.good()) { - if(token == "\t") - { - token.clear(); - } + if (token == "\t") + token.clear(); - std::getline(line_stream, token, delim); - // Remove whitespace from either side + std::getline(lineStream, token, delim); + // Remove whitespace from either side trim(token); - if(token[0] == '"' && token[token.size() - 1] != '"') + if (token[0] == '"' && token[token.size() - 1] != '"') { - std::string tok = token; + std::string tok = token; - while(token[token.size() - 1] != '"') - { - tok += delim; - std::getline(line_stream, token, delim); - tok += token; + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; } - token = tok; + token = tok; } - inout(row, col++) = infoSet.template MapString(std::move(token), row); + inout(row, col++) = infoSet.template MapString(std::move(token), row); } + // Make sure we got the right number of rows. if (col != cols) { std::ostringstream oss; oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" - << col << ") on line " << row << "; should be " << cols - << " dimensions."; + << col << ") on line " << row << "; should be " << cols + << " dimensions."; throw std::runtime_error(oss.str()); } @@ -454,12 +449,12 @@ class LoadCSV } /** - * Parse a transposed matrix. - * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper to load with. - */ - template + * Parse a transposed matrix. + * + * @param inout Matrix to load into. + * @param infoSet DatasetMapper to load with. + */ + template void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) { // Get matrix size. 
This also initializes infoSet correctly. @@ -482,51 +477,46 @@ class LoadCSV trim(line); // Reset the row we are looking at. (Remember this is transposed.) row = 0; - std::stringstream line_stream; - std::string token; + std::stringstream lineStream; + std::string ; - if(line.size() == 0) - { - break; - } - - line_stream.clear(); - line_stream.str(line); + lineStream.clear(); + lineStream.str(line); - while(line_stream.good()) + while (lineStream.good()) { - std::getline(line_stream, token, delim); + std::getline(lineStream, token, delim); // Remove whitespaces from either side - trim(token); + trim(token); - if(token[0] == '"' && token[token.size() - 1] != '"') + if (token[0] == '"' && token[token.size() - 1] != '"') { - // first part of the string - std::string tok = token; - - while(token[token.size() - 1] != '"') - { - tok += delim; - std::getline(line_stream, token, delim); - tok += token; + // first part of the string + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; } - token = tok; + token = tok; } inout(row, col) = infoSet.template MapString(std::move(token), row); - row++; + row++; } - + // Make sure we got the right number of rows. if (row != rows) { std::ostringstream oss; oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row - << ") on line " << col << "; should be " << rows << " dimensions."; + << ") on line " << col << "; should be " << rows << " dimensions."; throw std::runtime_error(oss.str()); } - + // Increment the column index. ++col; } @@ -535,6 +525,7 @@ class LoadCSV inline std::pair GetMatSize(std::fstream& f, const char delim); inline std::pair GetNonNumericMatSize(std::ifstream& f, const char delim); + //! Extension (type) of file. std::string extension; //! Name of file. 
diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 27b7627465a..196106007f5 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -42,237 +42,230 @@ namespace mlpack { namespace data { - - /** - * Given the address of a martix element(val) - * sets it equal to the provided value(token) - * example calling: convert_token(x.at(row, col), token) - */ - template - bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) + +/** +* Given the address of a matrix element(val) +* sets it equal to the provided value(token) +* example calling: convert_token(x.at(row, col), token) +*/ +template +bool LoadCSV::ConvertToken(typename MatType::elem_type& val, + const std::string& token) +{ + const size_t N = size_t(token.length()); + + if (N == 0) + { + val = typename MatType::elem_type(0); + return true; + } + + const char* str = token.c_str(); + + if ((N == 3) || (N == 4)) { - const size_t N = size_t(token.length()); + const bool neg = (str[0] == '-'); + const bool pos = (str[0] == '+'); + + const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; + + const char sig_a = str[offset]; + const char sig_b = str[offset+1]; + const char sig_c = str[offset+2]; - if (N == 0) + if (((sig_a == 'i') || (sig_a == 'I')) && + ((sig_b == 'n') || (sig_b == 'N')) && + ((sig_c == 'f') || (sig_c == 'F'))) { - val = typename MatType::elem_type(0); + val = neg ? -(std::numeric_limits::infinity()) : + std::numeric_limits::infinity(); return true; } - - const char* str = token.c_str(); - - if ((N == 3) || (N == 4)) + else if (((sig_a == 'n') || (sig_a == 'N')) && + ((sig_b == 'a') || (sig_b == 'A')) && + ((sig_c == 'n') || (sig_c == 'N'))) { - const bool neg = (str[0] == '-'); - const bool pos = (str[0] == '+'); - - const size_t offset = ((neg || pos) && (N == 4)) ? 
1 : 0; + val = std::numeric_limits::quiet_NaN(); + return true; + } + } - const char sig_a = str[offset]; - const char sig_b = str[offset+1]; - const char sig_c = str[offset+2]; + char* endptr = nullptr; - if (((sig_a == 'i') || (sig_a == 'I')) && - ((sig_b == 'n') || (sig_b == 'N')) && - ((sig_c == 'f') || (sig_c == 'F'))) - { - val = neg ? -(std::numeric_limits::infinity()) : - std::numeric_limits::infinity(); - return true; - } - else if (((sig_a == 'n') || (sig_a == 'N')) && - ((sig_b == 'a') || (sig_b == 'A')) && - ((sig_c == 'n') || (sig_c == 'N'))) + if (std::is_floating_point::value) + { + val = typename MatType::elem_type(std::strtod(str, &endptr)); + } + else if (std::is_integral::value) + { + if (std::is_signed::value) + val = typename MatType::elem_type(std::strtoll(str, &endptr, 10)); + else + { + if (str[0] == '-') { - val = std::numeric_limits::quiet_NaN(); + val = typename MatType::elem_type(0); return true; } + + val = typename MatType::elem_type( std::strtoull(str, &endptr, 10)); } + } - char* endptr = nullptr; + if (str == endptr) + return false; - if (std::is_floating_point::value) - { - val = typename MatType::elem_type(std::strtod(str, &endptr)); - } - else if (std::is_integral::value) - { - if (std::is_signed::value) - { - val = typename MatType::elem_type(std::strtoll(str, &endptr, 10)); - } - else - { - if (str[0] == '-') - { - val = typename MatType::elem_type(0); - return true; - } + return true; +} - val = typename MatType::elem_type( std::strtoull(str, &endptr, 10)); - } - } +inline std::pair LoadCSV::GetMatSize(std::fstream& f, const char delim = ',') +{ + + bool load_okay = f.good(); + + f.clear(); + + const std::fstream::pos_type pos1 = f.tellg(); + + size_t f_n_rows = 0; + size_t f_n_cols = 0; - if (str == endptr) + std::string lineString; + std::stringstream lineStream; + std::string token; + + while (f.good() && load_okay) + { + std::getline(f, lineString); + if (lineString.size() == 0) + break; + + lineStream.clear(); + 
lineStream.str(lineString); + + size_t line_n_cols = 0; + + while (lineStream.good()) { - return false; + std::getline(lineStream, token, delim); + ++line_n_cols; } - return true; + + if (f_n_cols < line_n_cols) + f_n_cols = line_n_cols; + + ++f_n_rows; } - inline std::pair LoadCSV::GetMatSize(std::fstream& f, const char delim = ',') - { - - bool load_okay = f.good(); - - f.clear(); - - const std::fstream::pos_type pos1 = f.tellg(); - - size_t f_n_rows = 0; - size_t f_n_cols = 0; - - std::string line_string; - std::stringstream line_stream; - std::string token; - - while (f.good() && load_okay) - { - std::getline(f, line_string); - if (line_string.size() == 0) - { - break; - } - line_stream.clear(); - line_stream.str(line_string); + f.clear(); + f.seekg(pos1); - size_t line_n_cols = 0; + std::pair mat_size(f_n_rows, f_n_cols); - while (line_stream.good()) - { - std::getline(line_stream, token, delim); - ++line_n_cols; - } + return mat_size; +} - if (f_n_cols < line_n_cols) - { - f_n_cols = line_n_cols; - } - ++f_n_rows; - } +inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, const char delim = ',') +{ + bool load_okay = f.good(); - f.clear(); - f.seekg(pos1); + f.clear(); - std::pair mat_size(f_n_rows, f_n_cols); + const std::fstream::pos_type pos1 = f.tellg(); - return mat_size; - } + size_t f_n_rows = 0; + size_t f_n_cols = 0; + std::string lineString; + std::stringstream lineStream; + std::string token; - inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, const char delim = ',') + while (f.good() && load_okay) { - bool load_okay = f.good(); - - f.clear(); - - const std::fstream::pos_type pos1 = f.tellg(); - - size_t f_n_rows = 0; - size_t f_n_cols = 0; - - std::string line_string; - std::stringstream line_stream; - std::string token; - - while (f.good() && load_okay) - { - std::getline(f, line_string); - if (line_string.size() == 0) - { - break; - } - line_stream.clear(); - line_stream.str(line_string); + std::getline(f, lineString); 
+ if (lineString.size() == 0) + break; - size_t line_n_cols = 0; - - while (line_stream.good()) - { - std::getline(line_stream, token, delim); + lineStream.clear(); + lineStream.str(lineString); - if(token[0] == '"' && token[token.size() - 1] != '"') - { - while(token[token.size() - 1] != '"') - { - std::getline(line_stream, token, delim); - } - } + size_t line_n_cols = 0; - ++line_n_cols; - } + while (lineStream.good()) + { + std::getline(lineStream, token, delim); - if (f_n_cols < line_n_cols) + if(token[0] == '"' && token[token.size() - 1] != '"') { - f_n_cols = line_n_cols; + while(token[token.size() - 1] != '"') + std::getline(lineStream, token, delim); } - ++f_n_rows; + ++line_n_cols; } - f.clear(); - f.seekg(pos1); + if (f_n_cols < line_n_cols) + f_n_cols = line_n_cols; - std::pair mat_size(f_n_rows, f_n_cols); - - return mat_size; + ++f_n_rows; } - /** - * Returns a bool value showing whether data was loaded successfully or not. - * Parses the file and loads the data into the given matrix. - */ - template - bool LoadCSV::LoadCSVFile(MatType& x, std::fstream& f) - { - bool load_okay = f.good(); - f.clear(); + f.clear(); + f.seekg(pos1); + std::pair mat_size(f_n_rows, f_n_cols); + return mat_size; +} + +/** +* Returns a bool value showing whether data was loaded successfully or not. +* Parses the file and loads the data into the given matrix. 
+*/ +template +bool LoadCSV::LoadCSVFile(MatType& x, std::fstream& f) +{ + bool load_okay = f.good(); - std::pair mat_size = GetMatSize(f); + f.clear(); - x.zeros(mat_size.first, mat_size.second); + std::pair mat_size = GetMatSize(f); - size_t row = 0; + x.zeros(mat_size.first, mat_size.second); - std::string line_string; - std::stringstream line_stream; - std::string token; + size_t row = 0; - while (f.good()) - { - std::getline(f, line_string); + std::string lineString; + std::stringstream lineStream; + std::string token; - if (line_string.size() == 0) - { - break; - } + while (f.good()) + { + std::getline(f, lineString); - line_stream.clear(); - line_stream.str(line_string); + if (lineString.size() == 0) + break; - size_t col = 0; + lineStream.clear(); + lineStream.str(lineString); - while (line_stream.good()) + size_t col = 0; + + while (lineStream.good()) + { + std::getline(lineStream, token, ','); + typename MatType::elem_type tmp_val = typename MatType::elem_type(0); + + if (ConvertToken(tmp_val, token)) { - std::getline(line_stream, token, ','); - ConvertToken(x.at(row, col), token); + x.at(row, col) = tmp_val; ++col; } - ++row; } - return load_okay; + + ++row; } + return load_okay; +} } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 6a8eef65881..b3449fbe688 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -1,6 +1,7 @@ /** * @file core/data/load_impl.hpp * @author Ryan Curtin + * @author Gopi Tatiraju * * Implementation of templatized load() function defined in load.hpp. 
* @@ -21,7 +22,6 @@ #include "extension.hpp" #include "detect_file_type.hpp" -#include "types.hpp" #include "string_algorithms.hpp" @@ -86,18 +86,18 @@ bool Load(const std::string& filename, MatType& matrix, const bool fatal, const bool transpose, - const file_type inputLoadType) + const FileType inputLoadType) { Timer::Start("loading_data"); // Catch nonexistent files by opening the stream ourselves. std::fstream stream; -#ifdef _WIN32 // Always open in binary mode on Windows. + #ifdef _WIN32 // Always open in binary mode on Windows. stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); -#else + #else stream.open(filename.c_str(), std::fstream::in); -#endif + #endif if (!stream.is_open()) { Timer::Stop("loading_data"); @@ -110,14 +110,14 @@ bool Load(const std::string& filename, return false; } - file_type loadType = inputLoadType; + FileType loadType = inputLoadType; std::string stringType; - if (inputLoadType == file_type::AutoDetect) + if (inputLoadType == FileType::AutoDetect) { // Attempt to auto-detect the type from the given file. loadType = AutoDetect(stream, filename); // Provide error if we don't know the type. - if (loadType == file_type::FileTypeUnknown) + if (loadType == FileType::FileTypeUnknown) { Timer::Stop("loading_data"); if (fatal) @@ -133,8 +133,8 @@ bool Load(const std::string& filename, stringType = GetStringType(loadType); -#ifndef ARMA_USE_HDF5 - if (inputLoadType == file_type::HDF5Binary) + #ifndef ARMA_USE_HDF5 + if (inputLoadType == FileType::HDF5Binary) { // Ensure that HDF5 is supported. Timer::Stop("loading_data"); @@ -149,10 +149,10 @@ bool Load(const std::string& filename, return false; } -#endif + #endif // Try to load the file; but if it's raw_binary, it could be a problem. - if (loadType == file_type::RawBinary) + if (loadType == FileType::RawBinary) Log::Warn << "Loading '" << filename << "' as " << stringType << "; " << "but this may not be the actual filetype!" 
<< std::endl; else @@ -163,9 +163,9 @@ bool Load(const std::string& filename, bool success; LoadCSV loader; - if (loadType != file_type::HDF5Binary) + if (loadType != FileType::HDF5Binary) { - if(loadType == file_type::CSVASCII) + if (loadType == FileType::CSVASCII) success = loader.LoadCSVFile(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); @@ -296,10 +296,11 @@ bool Load(const std::string& filename, return true; } +/* // For loading data into sparse matrix template -bool Load(const std::string& filename, - arma::sp_mat& matrix, +bool LoadSparseMatrix(const std::string& filename, + MatType matrix, const bool fatal, const bool transpose) { @@ -328,7 +329,7 @@ bool Load(const std::string& filename, } bool unknownType = false; - arma::file_type loadType; + arma::FileType loadType; std::string stringType; if (extension == "tsv" || extension == "txt") @@ -419,6 +420,7 @@ bool Load(const std::string& filename, // Finally, return the success indicator. return success; } +*/ } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index c36e349dd0e..3487e22726f 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -65,7 +65,7 @@ bool Save(const std::string& filename, const MatType& matrix, const bool fatal = false, bool transpose = true, - file_type inputSaveType = file_type::AutoDetect); + FileType inputSaveType = FileType::AutoDetect); /** * Saves a sparse matrix to file, guessing the filetype from the diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index ce8db3874d5..49d95f90b45 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -27,7 +27,7 @@ template bool Save(const std::string& filename, const arma::Col& vec, const bool fatal, - file_type inputSaveType) + FileType inputSaveType) { // Don't transpose: one observation per line (for CSVs at least). 
return Save(filename, vec, fatal, false, inputSaveType); @@ -37,7 +37,7 @@ template bool Save(const std::string& filename, const arma::Row& rowvec, const bool fatal, - file_type inputSaveType) + FileType inputSaveType) { return Save(filename, rowvec, fatal, true, inputSaveType); } @@ -47,18 +47,18 @@ bool Save(const std::string& filename, const MatType& matrix, const bool fatal, bool transpose, - file_type inputSaveType) + FileType inputSaveType) { Timer::Start("saving_data"); - file_type saveType = inputSaveType; + FileType saveType = inputSaveType; std::string stringType = ""; - if (inputSaveType == file_type::AutoDetect) + if (inputSaveType == FileType::AutoDetect) { // Detect the file type using only the extension. saveType = DetectFromExtension(filename); - if (saveType == file_type::FileTypeUnknown) + if (saveType == FileType::FileTypeUnknown) { if (fatal) Log::Fatal << "Could not detect type of file '" << filename << "' for " @@ -104,7 +104,7 @@ bool Save(const std::string& filename, #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == file_type::HDF5Binary) ? + const bool success = (saveType == FileType::HDF5Binary) ? tmp.quiet_save(filename, ToArmaFileType(saveType)) : tmp.quiet_save(stream, ToArmaFileType(saveType)); #else @@ -125,7 +125,7 @@ bool Save(const std::string& filename, { #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == file_type::HDF5Binary) ? + const bool success = (saveType == FileType::HDF5Binary) ? 
matrix.quiet_save(filename, ToArmaFileType(saveType)) : matrix.quiet_save(stream, ToArmaFileType(saveType)); #else @@ -194,23 +194,23 @@ bool Save(const std::string& filename, } bool unknownType = false; - file_type saveType; + FileType saveType; std::string stringType; if (extension == "txt" || extension == "tsv") { - saveType = file_type::CoordASCII; + saveType = FileType::CoordASCII; stringType = "raw ASCII formatted data"; } else if (extension == "bin") { - saveType = file_type::ArmaBinary; + saveType = FileType::ArmaBinary; stringType = "Armadillo binary formatted data"; } else { unknownType = true; - saveType = file_type::RawBinary; // Won't be used; prevent a warning. + saveType = FileType::RawBinary; // Won't be used; prevent a warning. stringType = ""; } diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 389371cdbe3..5d5d44019b6 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -2,7 +2,7 @@ * @file core/data/string_algorithms.hpp * @author Gopi M. Tatiraju * - * Utility fucntions related to string manipulation + * Utility functions related to string manipulation * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the @@ -25,34 +25,32 @@ namespace data{ */ inline void trim(std::string& str) { - if(str.find_first_not_of(' ') == std::string::npos) + if (str.find_first_not_of(' ') == std::string::npos) { str = ""; return; } + size_t startIndex = 0; - while(std::isspace(str[startIndex])) - { + while (std::isspace(str[startIndex])) startIndex++; - } size_t endIndex = str.size() - 1; - while(std::isspace(str[endIndex])) - { + while (std::isspace(str[endIndex])) endIndex--; - } std::string trimmedStr = (endIndex - startIndex == str.size()) ? 
- std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); + std::move(str) : str.substr(startIndex, + endIndex - startIndex + 1); str = trimmedStr; } /** * Trim off characters from start and end of - * of the string. The supplied fucntion is + * of the string. The supplied function is * used to determine which characters will * be trimmed off. * @@ -61,7 +59,7 @@ inline void trim(std::string& str) */ inline void trim_if(std::string &str, std::function func) { - if(str.find_first_not_of(' ') == std::string::npos) + if (str.find_first_not_of(' ') == std::string::npos) { str = ""; return; @@ -69,11 +67,11 @@ inline void trim_if(std::string &str, std::function func) size_t startIndex = 0; - for(size_t i = 0; i < str.size(); i++) + for (size_t i = 0; i < str.size(); i++) { bool match = func(str[i]); - if(match) + if (match) startIndex++; else break; @@ -84,14 +82,14 @@ inline void trim_if(std::string &str, std::function func) for(int i = str.size() - 1; i >= 0; i--) { bool match = func(str[i]); - if(match) + if (match) endIndex--; else break; } std::string trimmedStr = (endIndex - startIndex == str.size()) ? - std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); + std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); str = trimmedStr; } diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index 9c145d2fe46..0a0de300ff6 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -27,7 +27,7 @@ namespace mlpack namespace data { -enum struct file_type +enum struct FileType { FileTypeUnknown, AutoDetect, //!< attempt to automatically detect the file type @@ -43,13 +43,12 @@ enum struct file_type }; /** - * This fucntion is used to convert mlpack file type to respective - * arma file type. + * This function is used to convert mlpack file types to + * their respective Armadillo file types. 
* - * @param type mlpack::file_type + * @param type mlpack::FileType */ -inline arma::file_type ToArmaFileType(const file_type& type); - +inline arma::file_type ToArmaFileType(const FileType& type); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp index 7ebb8e1f897..f3a6eaea674 100644 --- a/src/mlpack/core/data/types_impl.hpp +++ b/src/mlpack/core/data/types_impl.hpp @@ -21,56 +21,54 @@ #include "types.hpp" -namespace mlpack -{ -namespace data -{ +namespace mlpack{ +namespace data{ -inline arma::file_type ToArmaFileType(const file_type& type) +inline arma::file_type ToArmaFileType(const FileType& type) { switch(type) { - case file_type::FileTypeUnknown: + case FileType::FileTypeUnknown: return arma::file_type_unknown; break; - case file_type::AutoDetect: + case FileType::AutoDetect: return arma::auto_detect; break; - case file_type::RawASCII: + case FileType::RawASCII: return arma::raw_ascii; break; - case file_type::ArmaASCII: + case FileType::ArmaASCII: return arma::arma_ascii; break; - case file_type::CSVASCII: + case FileType::CSVASCII: return arma::csv_ascii; break; - case file_type::RawBinary: + case FileType::RawBinary: return arma::raw_binary; break; - case file_type::ArmaBinary: + case FileType::ArmaBinary: return arma::arma_binary; break; - case file_type::PGMBinary: + case FileType::PGMBinary: return arma::pgm_binary; break; - case file_type::PPMBinary: + case FileType::PPMBinary: return arma::ppm_binary; break; - case file_type::HDF5Binary: + case FileType::HDF5Binary: return arma::hdf5_binary; break; - case file_type::CoordASCII: + case FileType::CoordASCII: return arma::coord_ascii; break; From 756d7afbf368c7c16fe171df070ab372fc015243 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 12 Aug 2021 07:56:46 +0530 Subject: [PATCH 071/112] Removing load() for sparse matrix | Solving some errors from last commit --- src/mlpack/core/data/load.hpp | 37 --------- 
src/mlpack/core/data/load_csv.hpp | 4 +- src/mlpack/core/data/load_impl.hpp | 126 ----------------------------- 3 files changed, 2 insertions(+), 165 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index d07a3ea69e2..11b4f9edc66 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -76,43 +76,6 @@ bool Load(const std::string& filename, const bool transpose = true, const FileType inputLoadType = FileType::AutoDetect); -/** - * Loads a sparse matrix from file, using arma::coord_ascii format. This - * will transpose the matrix at load time (unless the transpose parameter is set - * to false). If the filetype cannot be determined, an error will be given. - * - * The supported types of files are the same as found in Armadillo: - * - * - TSV (coord_ascii), denoted by .tsv or .txt - * - TXT (coord_ascii), denoted by .txt - * - Raw binary (raw_binary), denoted by .bin - * - Armadillo binary (arma_binary), denoted by .bin - * - * If the file extension is not one of those types, an error will be given. - * This is preferable to Armadillo's default behavior of loading an unknown - * filetype as raw_binary, which can have very confusing effects. - * - * If the parameter 'fatal' is set to true, a std::runtime_error exception will - * be thrown if the matrix does not load successfully. The parameter - * 'transpose' controls whether or not the matrix is transposed after loading. - * In most cases, because data is generally stored in a row-major format and - * mlpack requires column-major matrices, this should be left at its default - * value of 'true'. - * - * @param filename Name of file to load. - * @param matrix Sparse matrix to load contents of file into. - * @param fatal If an error should be reported as fatal (default false). - * @param transpose If true, transpose the matrix after loading (default true). - * @return Boolean value indicating success or failure of load. 
- */ -/* -template -bool LoadSparseMatrix(const std::string& filename, - MatType matrix, - const bool fatal = false, - const bool transpose = true); -*/ - /** * Load a column vector from a file, guessing the filetype from the extension. * diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index fdd946f1237..bffc8c23b09 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -454,7 +454,7 @@ class LoadCSV * @param inout Matrix to load into. * @param infoSet DatasetMapper to load with. */ - template + template void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) { // Get matrix size. This also initializes infoSet correctly. @@ -478,7 +478,7 @@ class LoadCSV // Reset the row we are looking at. (Remember this is transposed.) row = 0; std::stringstream lineStream; - std::string ; + std::string token; lineStream.clear(); lineStream.str(line); diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index b3449fbe688..b4ca857e3f9 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -296,132 +296,6 @@ bool Load(const std::string& filename, return true; } -/* -// For loading data into sparse matrix -template -bool LoadSparseMatrix(const std::string& filename, - MatType matrix, - const bool fatal, - const bool transpose) -{ - Timer::Start("loading_data"); - - // Get the extension. - std::string extension = Extension(filename); - - // Catch nonexistent files by opening the stream ourselves. - std::fstream stream; -#ifdef _WIN32 // Always open in binary mode on Windows. - stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); -#else - stream.open(filename.c_str(), std::fstream::in); -#endif - if (!stream.is_open()) - { - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; - else - Log::Warn << "Cannot open file '" << filename << "'; load failed." 
- << std::endl; - - return false; - } - - bool unknownType = false; - arma::FileType loadType; - std::string stringType; - - if (extension == "tsv" || extension == "txt") - { - loadType = arma::coord_ascii; - stringType = "Coordinate Formatted Data for Sparse Matrix"; - } - else if (extension == "bin") - { - // This could be raw binary or Armadillo binary (binary with header). We - // will check to see if it is Armadillo binary. - const std::string ARMA_SPM_BIN = "ARMA_SPM_BIN"; - std::string rawHeader(ARMA_SPM_BIN.length(), '\0'); - - std::streampos pos = stream.tellg(); - - stream.read(&rawHeader[0], std::streamsize(ARMA_SPM_BIN.length())); - stream.clear(); - stream.seekg(pos); // Reset stream position after peeking. - - if (rawHeader == ARMA_SPM_BIN) - { - stringType = "Armadillo binary formatted data for sparse matrix"; - loadType = arma::arma_binary; - } - else // We can only assume it's raw binary. - { - stringType = "raw binary formatted data"; - loadType = arma::raw_binary; - } - } - else // Unknown extension... - { - unknownType = true; - loadType = arma::raw_binary; // Won't be used; prevent a warning. - stringType = ""; - } - - // Provide error if we don't know the type. - if (unknownType) - { - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Unable to detect type of '" << filename << "'; " - << "incorrect extension?" << std::endl; - else - Log::Warn << "Unable to detect type of '" << filename << "'; load failed." - << " Incorrect extension?" << std::endl; - - return false; - } - - // Try to load the file; but if it's raw_binary, it could be a problem. - if (loadType == arma::raw_binary) - Log::Warn << "Loading '" << filename << "' as " << stringType << "; " - << "but this may not be the actual filetype!" << std::endl; - else - Log::Info << "Loading '" << filename << "' as " << stringType << ". 
" - << std::flush; - - bool success; - - success = matrix.load(stream, loadType); - - if (!success) - { - Log::Info << std::endl; - Timer::Stop("loading_data"); - if (fatal) - Log::Fatal << "Loading from '" << filename << "' failed." << std::endl; - else - Log::Warn << "Loading from '" << filename << "' failed." << std::endl; - - return false; - } - else - Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows) - << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n"; - - // Now transpose the matrix, if necessary. - if (transpose) - { - success = inplace_transpose(matrix, fatal); - } - - Timer::Stop("loading_data"); - - // Finally, return the success indicator. - return success; -} -*/ - } // namespace data } // namespace mlpack From c1cf824a2b283c7c930fd5498726aeeef7192156 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 12 Aug 2021 08:20:35 +0530 Subject: [PATCH 072/112] Chaning template parameter for Load() with DatasetMapper --- src/mlpack/core/data/load.hpp | 4 ++-- src/mlpack/core/data/load_impl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 11b4f9edc66..4a664a7824c 100644 --- a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -176,9 +176,9 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, - arma::Mat& matrix, + MatType& matrix, DatasetMapper& info, const bool fatal = false, const bool transpose = true); diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index b4ca857e3f9..d963f92a021 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -201,9 +201,9 @@ bool Load(const std::string& filename, } // Load with mappings. Unfortunately we have to implement this ourselves. 
-template +template bool Load(const std::string& filename, - arma::Mat& matrix, + MatType& matrix, DatasetMapper& info, const bool fatal, const bool transpose) From 626bbc6d1f40c4225be89f405686dde4c15832e9 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 13 Aug 2021 12:04:34 +0530 Subject: [PATCH 073/112] Combined GetMatSize() and GetNonNumericMatSize() | Created new fucntions NumericParse() and CategoricalParse() --- src/mlpack/core/data/load_csv.hpp | 33 ++++-------- src/mlpack/core/data/load_csv_impl.hpp | 71 +++++++++----------------- 2 files changed, 33 insertions(+), 71 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index bffc8c23b09..c69c8d5fb9d 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -126,6 +126,10 @@ class LoadCSV template bool LoadCSVFile(MatType& x, std::fstream& f); + inline void NumericParse(std::stringstream& lineStream, size_t& col, const char delim); + + inline void CategoricalParse(std::stringstream& lineStream, size_t& col, const char delim); + /** * Load the file into the given matrix with the given DatasetMapper object. * Throws exceptions on errors. @@ -193,20 +197,6 @@ class LoadCSV throw std::invalid_argument(oss.str()); } - // Reset the DatasetInfo object, if needed. - if (info.Dimensionality() == 0) - { - info.SetDimensionality(rows); - } - else if (info.Dimensionality() != rows) - { - std::ostringstream oss; - oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; - throw std::invalid_argument(oss.str()); - } - // Now, jump back to the beginning of the file. inFile.clear(); inFile.seekg(0, std::ios::beg); @@ -220,7 +210,7 @@ class LoadCSV if (rows == 1) { // Extract the number of columns. 
- std::pair dimen = GetNonNumericMatSize(inFile, delim); + std::pair dimen = GetMatSize(inFile, false, delim); cols = dimen.second; } @@ -274,9 +264,8 @@ class LoadCSV * @param info DatasetMapper object to use for first pass. */ template - void GetTransposeMatrixSize(size_t& rows, - size_t& cols, - DatasetMapper& info) + void GetTransposeMatrixSize(size_t& rows, size_t& cols, + DatasetMapper& info) { // Take a pass through the file. If the DatasetMapper policy requires it, // we will pass everything string through MapString(). This might be useful @@ -299,7 +288,7 @@ class LoadCSV if (cols == 1) { // Extract the number of dimensions. - std::pair dimen = GetNonNumericMatSize(inFile, delim); + std::pair dimen = GetMatSize(inFile, false, delim); rows = dimen.second; // Reset the DatasetInfo object, if needed. @@ -522,16 +511,14 @@ class LoadCSV } } - inline std::pair GetMatSize(std::fstream& f, const char delim); - - inline std::pair GetNonNumericMatSize(std::ifstream& f, const char delim); + inline std::pair GetMatSize(std::fstream& f, const bool isNumeric, const char delim); //! Extension (type) of file. std::string extension; //! Name of file. std::string filename; //! Opened stream for reading. - std::ifstream inFile; + std::fstream inFile; //! 
Delimiter char char delim; }; diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_csv_impl.hpp index 196106007f5..4854ca5b3d4 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_csv_impl.hpp @@ -118,55 +118,36 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, return true; } -inline std::pair LoadCSV::GetMatSize(std::fstream& f, const char delim = ',') +inline void LoadCSV::NumericParse(std::stringstream& lineStream, size_t& col, const char delim) { - - bool load_okay = f.good(); - - f.clear(); - - const std::fstream::pos_type pos1 = f.tellg(); - - size_t f_n_rows = 0; - size_t f_n_cols = 0; - - std::string lineString; - std::stringstream lineStream; std::string token; - while (f.good() && load_okay) + while (lineStream.good()) { - std::getline(f, lineString); - if (lineString.size() == 0) - break; + std::getline(lineStream, token, delim); + ++col; + } +} - lineStream.clear(); - lineStream.str(lineString); +inline void LoadCSV::CategoricalParse(std::stringstream& lineStream, size_t& col, const char delim) +{ + std::string token; - size_t line_n_cols = 0; + while (lineStream.good()) + { + std::getline(lineStream, token, delim); - while (lineStream.good()) + if (token[0] == '"' && token[token.size() - 1] != '"') { - std::getline(lineStream, token, delim); - ++line_n_cols; + while (token[token.size() - 1] != '"') + std::getline(lineStream, token, delim); } - if (f_n_cols < line_n_cols) - f_n_cols = line_n_cols; - - ++f_n_rows; + ++col; } - - f.clear(); - f.seekg(pos1); - - std::pair mat_size(f_n_rows, f_n_cols); - - return mat_size; } - -inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, const char delim = ',') +inline std::pair LoadCSV::GetMatSize(std::fstream& f, const bool isNumeric = true, const char delim = ',') { bool load_okay = f.good(); @@ -192,18 +173,10 @@ inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, size_t line_n_cols = 0; - while (lineStream.good()) 
- { - std::getline(lineStream, token, delim); - - if(token[0] == '"' && token[token.size() - 1] != '"') - { - while(token[token.size() - 1] != '"') - std::getline(lineStream, token, delim); - } - - ++line_n_cols; - } + if (isNumeric) + NumericParse(lineStream, line_n_cols, delim); + else + CategoricalParse(lineStream, line_n_cols, delim); if (f_n_cols < line_n_cols) f_n_cols = line_n_cols; @@ -213,7 +186,9 @@ inline std::pair LoadCSV::GetNonNumericMatSize(std::ifstream& f, f.clear(); f.seekg(pos1); + std::pair mat_size(f_n_rows, f_n_cols); + return mat_size; } From 4599bc5832866f53d75ae80cb6564ad97882f164 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 13 Aug 2021 14:53:46 +0530 Subject: [PATCH 074/112] Adding MapOnFirstPass() --- src/mlpack/core/data/load_csv.hpp | 107 ++++++++++++------------------ 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index c69c8d5fb9d..0453c2ac67e 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -126,9 +126,47 @@ class LoadCSV template bool LoadCSVFile(MatType& x, std::fstream& f); - inline void NumericParse(std::stringstream& lineStream, size_t& col, const char delim); + inline void NumericParse(std::stringstream& lineStream, size_t& col, + const char delim); - inline void CategoricalParse(std::stringstream& lineStream, size_t& col, const char delim); + inline void CategoricalParse(std::stringstream& lineStream, size_t& col, + const char delim); + +template +void MapOnFirstPass(std::string& line, DatasetMapper& info, const char delim, size_t& dim) +{ + // In this case we must pass everything we parse to the MapPolicy. 
+ std::string str(line.begin(), line.end()); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + info.template MapFirstPass(std::move(token), dim - 1); + } +} /** * Load the file into the given matrix with the given DatasetMapper object. @@ -217,39 +255,7 @@ class LoadCSV // I guess this is technically a second pass, but that's ok... still the // same idea... if (MapPolicy::NeedsFirstPass) - { - // In this case we must pass everything we parse to the MapPolicy. - std::string str(line.begin(), line.end()); - - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - info.template MapFirstPass(std::move(token), rows - 1); - } - } + MapOnFirstPass(line, info, delim, rows); } } @@ -309,37 +315,8 @@ class LoadCSV // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { - // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; - - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - info.template MapFirstPass(std::move(token), dim++); - } + MapOnFirstPass(line, info, delim, dim); } } } From 6ee199db0fa340ae5d22ae8c2949a4cdc31776f9 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 13 Aug 2021 15:41:53 +0530 Subject: [PATCH 075/112] Running all tests --- src/mlpack/tests/load_save_test.cpp | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 164a0aeac6a..44f09fd8700 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -85,7 +85,7 @@ TEST_CASE("WrongExtensionCorrectLoad", "[LoadSaveTest]") // Now reload through our interface. REQUIRE( - data::Load("test_file.csv", test, false, true, file_type::ArmaBinary) + data::Load("test_file.csv", test, false, true, FileType::ArmaBinary) == true); REQUIRE(test.n_rows == 4); @@ -127,7 +127,6 @@ TEST_CASE("LoadCSVTest", "[LoadSaveTest]") /** * Make sure a TSV is loaded correctly to a sparse matrix. */ -/* TEST_CASE("LoadSparseTSVTest", "[LoadSaveTest]") { fstream f; @@ -164,13 +163,10 @@ TEST_CASE("LoadSparseTSVTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.tsv"); } -*/ - /** * Make sure a CSV in text format is loaded correctly to a sparse matrix. */ -/* TEST_CASE("LoadSparseTXTTest", "[LoadSaveTest]") { fstream f; @@ -206,7 +202,7 @@ TEST_CASE("LoadSparseTXTTest", "[LoadSaveTest]") // Remove the file. 
remove("test_sparse_file.txt"); } -*/ + /** * Make sure a TSV is loaded correctly. */ @@ -273,7 +269,7 @@ TEST_CASE("LoadAnyExtensionFileTest", "[LoadSaveTest]") f.close(); arma::mat test; - REQUIRE(data::Load("test_file.blah", test, false, true, file_type::RawASCII)); + REQUIRE(data::Load("test_file.blah", test, false, true, FileType::RawASCII)); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); @@ -314,7 +310,6 @@ TEST_CASE("SaveCSVTest", "[LoadSaveTest]") /** * Make sure a TSV is saved correctly for a sparse matrix */ -/* TEST_CASE("SaveSparseTSVTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1\t0\t0\t0;" @@ -346,11 +341,10 @@ TEST_CASE("SaveSparseTSVTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.tsv"); } -*/ + /** * Make sure a TXT is saved correctly for a sparse matrix */ -/* TEST_CASE("SaveSparseTXTTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1 0 0 0;" @@ -382,11 +376,10 @@ TEST_CASE("SaveSparseTXTTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.txt"); } -*/ + /** * Make sure a Sparse Matrix is saved and loaded correctly in binary format */ -/* TEST_CASE("SaveSparseBinaryTest", "[LoadSaveTest]") { arma::sp_mat test = "0.1 0 0 0;" @@ -418,7 +411,7 @@ TEST_CASE("SaveSparseBinaryTest", "[LoadSaveTest]") // Remove the file. remove("test_sparse_file.bin"); } -*/ + /** * Make sure CSVs can be loaded in transposed form. 
*/ @@ -986,10 +979,10 @@ TEST_CASE("SaveArmaBinaryArbitraryExtensionTest", "[LoadSaveTest]") "4 8;"; REQUIRE(data::Save("test_file.blerp.blah", test, false, true, - file_type::ArmaBinary) == true); + FileType::ArmaBinary) == true); REQUIRE(data::Load("test_file.blerp.blah", test, false, true, - file_type::ArmaBinary) == true); + FileType::ArmaBinary) == true); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); From a2a352d3123e8daae4490a8e163380039fe10cb5 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 13 Aug 2021 20:04:50 +0530 Subject: [PATCH 076/112] Checking --- src/mlpack/core/data/load_csv.hpp | 132 +++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 30 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 0453c2ac67e..ef494ce4956 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -132,41 +132,41 @@ class LoadCSV inline void CategoricalParse(std::stringstream& lineStream, size_t& col, const char delim); -template -void MapOnFirstPass(std::string& line, DatasetMapper& info, const char delim, size_t& dim) -{ - // In this case we must pass everything we parse to the MapPolicy. - std::string str(line.begin(), line.end()); + template + void MapOnFirstPass(std::string& line, DatasetMapper& info, const char delim, size_t& dim) + { + // In this case we must pass everything we parse to the MapPolicy. 
+ std::string str(line.begin(), line.end()); - std::stringstream lineStream; - std::string token; + std::stringstream lineStream; + std::string token; - lineStream.clear(); - lineStream.str(line); + lineStream.clear(); + lineStream.str(line); - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') + if (token[0] == '"' && token[token.size() - 1] != '"') { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; } - token = tok; + info.template MapFirstPass(std::move(token), dim - 1); } - - info.template MapFirstPass(std::move(token), dim - 1); } -} /** * Load the file into the given matrix with the given DatasetMapper object. @@ -182,6 +182,20 @@ void MapOnFirstPass(std::string& line, DatasetMapper& info, const cha DatasetMapper &infoSet, const bool transpose = true) { + std::cout << "This is extension: " << extension << "\n"; + if (extension == "csv") + { + delim = ','; + } + else if (extension == "tsv") + { + delim = '\t'; + } + else if (extension == "txt") + { + delim = ' '; + } + CheckOpen(); if (transpose) @@ -255,10 +269,40 @@ void MapOnFirstPass(std::string& line, DatasetMapper& info, const cha // I guess this is technically a second pass, but that's ok... still the // same idea... 
if (MapPolicy::NeedsFirstPass) - MapOnFirstPass(line, info, delim, rows); + { + std::string str(line.begin(), line.end()); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + info.template MapFirstPass(std::move(token), rows - 1); + } } } - +} /** * Peek at the file to determine the number of rows and columns in the matrix, * assuming a transposed matrix. This will also take a first pass over the @@ -297,7 +341,6 @@ void MapOnFirstPass(std::string& line, DatasetMapper& info, const cha std::pair dimen = GetMatSize(inFile, false, delim); rows = dimen.second; - // Reset the DatasetInfo object, if needed. if (info.Dimensionality() == 0) { info.SetDimensionality(rows); @@ -315,8 +358,37 @@ void MapOnFirstPass(std::string& line, DatasetMapper& info, const cha // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { + // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; - MapOnFirstPass(line, info, delim, dim); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + info.template MapFirstPass(std::move(token), dim++); + } } } } From ce0a90473a96de86dc23c288f38321944e61666f Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Fri, 13 Aug 2021 23:53:17 +0530 Subject: [PATCH 077/112] Refactoring parser into two files --- src/mlpack/core/data/CMakeLists.txt | 3 +- src/mlpack/core/data/load_categorical_csv.hpp | 361 ++++++++++++++ src/mlpack/core/data/load_csv.hpp | 457 ++++-------------- src/mlpack/core/data/load_impl.hpp | 4 +- ...load_csv_impl.hpp => load_numeric_csv.hpp} | 135 +----- src/mlpack/tests/CMakeLists.txt | 346 ++++++------- 6 files changed, 644 insertions(+), 662 deletions(-) create mode 100644 src/mlpack/core/data/load_categorical_csv.hpp rename src/mlpack/core/data/{load_csv_impl.hpp => load_numeric_csv.hpp} (50%) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index b652189abfd..33f0c23dabf 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -10,7 +10,8 @@ set(SOURCES has_serialize.hpp is_naninf.hpp load_csv.hpp - load_csv_impl.hpp + load_numeric_csv.hpp + load_categorical_csv.hpp load.hpp load_image_impl.hpp load_image.cpp diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp new file mode 100644 index 00000000000..ac011422608 --- /dev/null +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -0,0 +1,361 @@ +/** + * @file core/data/load_categorical_csv.hpp + * 
@author Gopi Tatiraju + * + * Load a matrix from file. Matrix may contain categorical data. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef MLPACK_CORE_DATA_LOAD_CATEGORICAL_CSV_HPP +#define MLPACK_CORE_DATA_LOAD_CATEGORICAL_CSV_HPP + +#include "load_csv.hpp" + +namespace mlpack{ +namespace data{ + +template +void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, + DatasetMapper& info) +{ + // Take a pass through the file. If the DatasetMapper policy requires it, + // we will pass everything string through MapString(). This might be useful + // if, e.g., the MapPolicy needs to find which dimensions are numeric or + // categorical. + + // Reset to the start of the file. + inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + cols = 0; + + std::string line; + while (std::getline(inFile, line)) + { + ++cols; + // Remove whitespaces from either side + trim(line); + + if (cols == 1) + { + // Extract the number of dimensions. + std::pair dimen = GetMatrixSize(inFile, false, delim); + rows = dimen.second; + + if (info.Dimensionality() == 0) + { + info.SetDimensionality(rows); + } + else if (info.Dimensionality() != rows) + { + std::ostringstream oss; + oss << "data::LoadCSV(): given DatasetInfo has dimensionality " + << info.Dimensionality() << ", but data has dimensionality " + << rows; + throw std::invalid_argument(oss.str()); + } + } + + // If we need to do a first pass for the DatasetMapper, do it. + if (MapPolicy::NeedsFirstPass) + { + // In this case we must pass everything we parse to the MapPolicy. 
+ size_t dim = 0; + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + info.template MapFirstPass(std::move(token), dim++); + } + } + } +} + +template +void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& info) +{ + // Take a pass through the file. If the DatasetMapper policy requires it, + // we will pass everything string through MapString(). This might be useful + // if, e.g., the MapPolicy needs to find which dimensions are numeric or + // categorical. + + // Reset to the start of the file. + inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + cols = 0; + + // First, count the number of rows in the file (this is the dimensionality). + std::string line; + while (std::getline(inFile, line)) + ++rows; + + // Reset the DatasetInfo object, if needed. + if (info.Dimensionality() == 0) + { + info.SetDimensionality(rows); + } + else if (info.Dimensionality() != rows) + { + std::ostringstream oss; + oss << "data::LoadCSV(): given DatasetInfo has dimensionality " + << info.Dimensionality() << ", but data has dimensionality " + << rows; + throw std::invalid_argument(oss.str()); + } + + // Now, jump back to the beginning of the file. + inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + + while (std::getline(inFile, line)) + { + ++rows; + // Remove whitespaces from either side + trim(line); + if (rows == 1) + { + // Extract the number of columns. + std::pair dimen = GetMatrixSize(inFile, false, delim); + cols = dimen.second; + } + + // I guess this is technically a second pass, but that's ok... 
still the + // same idea... + if (MapPolicy::NeedsFirstPass) + { + std::string str(line.begin(), line.end()); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + info.template MapFirstPass(std::move(token), rows - 1); + } + } + } +} + +template +void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) +{ + // Get matrix size. This also initializes infoSet correctly. + size_t rows, cols; + InitializeTransposeMapper(rows, cols, infoSet); + + // Set the matrix size. + inout.set_size(rows, cols); + + // Initialize auxiliary variables. + size_t row = 0; + size_t col = 0; + std::string line; + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + while (std::getline(inFile, line)) + { + // Remove whitespaces from either side + trim(line); + // Reset the row we are looking at. (Remember this is transposed.) + row = 0; + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespaces from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + // first part of the string + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + inout(row, col) = infoSet.template MapString(std::move(token), row); + row++; + } + + // Make sure we got the right number of rows. 
+ if (row != rows) + { + std::ostringstream oss; + oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row + << ") on line " << col << "; should be " << rows << " dimensions."; + throw std::runtime_error(oss.str()); + } + + // Increment the column index. + ++col; + } +} + +template +void LoadCSV::NonTransposeParse(arma::Mat& inout, + DatasetMapper& infoSet) +{ + // Get the size of the matrix. + size_t rows, cols; + InitializeMapper(rows, cols, infoSet); + + // Set up output matrix. + inout.set_size(rows, cols); + size_t row = 0; + size_t col = 0; + + // Reset file position. + std::string line; + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + while (std::getline(inFile, line)) + { + // Remove whitespaces from either side + trim(line); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + if (token == "\t") + token.clear(); + + std::getline(lineStream, token, delim); + // Remove whitespace from either side + trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + + while (token[token.size() - 1] != '"') + { + tok += delim; + std::getline(lineStream, token, delim); + tok += token; + } + + token = tok; + } + + inout(row, col++) = infoSet.template MapString(std::move(token), row); + } + + // Make sure we got the right number of rows. 
+ if (col != cols) + { + std::ostringstream oss; + oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" + << col << ") on line " << row << "; should be " << cols + << " dimensions."; + throw std::runtime_error(oss.str()); + } + + ++row; col = 0; + } +} + +template +void LoadCSV::LoadCategoricalCSV(MatType &inout, + DatasetMapper &infoSet, + const bool transpose) +{ + CheckOpen(); + + if (transpose) + TransposeParse(inout, infoSet); + else + NonTransposeParse(inout, infoSet); +} + + + +inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim) +{ + std::string token; + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + while (token[token.size() - 1] != '"') + std::getline(lineStream, token, delim); + } + + ++col; + } +} + +} //namespace data +} //namespace mlpack + +#endif diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index ef494ce4956..11a2490efa5 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -94,19 +94,7 @@ class LoadCSV CheckOpen(); } - /** - * Convert the given string token to assigned datatype and assign - * this value to the given address. The address here will be a - * matrix location. - * - * Token is always read as a string, if the given token is +/-INF or NAN - * it converts them to infinity and NAN using numeric_limits. - * - * @param val Token's value will be assigned to this address - * @param token Value which should be assigned - */ - template - bool ConvertToken(typename MatType::elem_type& val, const std::string& token); + // Fucntions for Numeric Parser /** * Returns a bool value showing whether data was loaded successfully or not. 
@@ -124,49 +112,27 @@ class LoadCSV * @param f File stream to access the data file */ template - bool LoadCSVFile(MatType& x, std::fstream& f); + bool LoadNumericCSV(MatType& x, std::fstream& f); - inline void NumericParse(std::stringstream& lineStream, size_t& col, - const char delim); - - inline void CategoricalParse(std::stringstream& lineStream, size_t& col, - const char delim); - - template - void MapOnFirstPass(std::string& line, DatasetMapper& info, const char delim, size_t& dim) - { - // In this case we must pass everything we parse to the MapPolicy. - std::string str(line.begin(), line.end()); - - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; + /** + * Convert the given string token to assigned datatype and assign + * this value to the given address. The address here will be a + * matrix location. + * + * Token is always read as a string, if the given token is +/-INF or NAN + * it converts them to infinity and NAN using numeric_limits. + * + * @param val Token's value will be assigned to this address + * @param token Value which should be assigned + */ + template + bool ConvertToken(typename MatType::elem_type& val, const std::string& token); - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - token = tok; - } + inline void NumericMatSize(std::stringstream& lineStream, size_t& col, + const char delim); - info.template MapFirstPass(std::move(token), dim - 1); - } - } + // Functions for Categorical Parse /** * Load the file into the given matrix with the given DatasetMapper object. @@ -178,31 +144,9 @@ class LoadCSV * (default). 
*/ template - void Load(MatType &inout, - DatasetMapper &infoSet, - const bool transpose = true) - { - std::cout << "This is extension: " << extension << "\n"; - if (extension == "csv") - { - delim = ','; - } - else if (extension == "tsv") - { - delim = '\t'; - } - else if (extension == "txt") - { - delim = ' '; - } - - CheckOpen(); - - if (transpose) - TransposeParse(inout, infoSet); - else - NonTransposeParse(inout, infoSet); - } + void LoadCategoricalCSV(MatType &inout, + DatasetMapper &infoSet, + const bool transpose = true); /** * Peek at the file to determine the number of rows and columns in the matrix, @@ -215,94 +159,9 @@ class LoadCSV * @param info DatasetMapper object to use for first pass. */ template - void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper& info) - { - // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). This might be useful - // if, e.g., the MapPolicy needs to find which dimensions are numeric or - // categorical. - - // Reset to the start of the file. - inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - cols = 0; - - // First, count the number of rows in the file (this is the dimensionality). - std::string line; - while (std::getline(inFile, line)) - { - ++rows; - } - - // Reset the DatasetInfo object, if needed. - if (info.Dimensionality() == 0) - { - info.SetDimensionality(rows); - } - else if (info.Dimensionality() != rows) - { - std::ostringstream oss; - oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; - throw std::invalid_argument(oss.str()); - } + void InitializeMapper(size_t& rows, size_t& cols, + DatasetMapper& info); - // Now, jump back to the beginning of the file. 
- inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - - while (std::getline(inFile, line)) - { - ++rows; - // Remove whitespaces from either side - trim(line); - if (rows == 1) - { - // Extract the number of columns. - std::pair dimen = GetMatSize(inFile, false, delim); - cols = dimen.second; - } - - // I guess this is technically a second pass, but that's ok... still the - // same idea... - if (MapPolicy::NeedsFirstPass) - { - std::string str(line.begin(), line.end()); - - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - info.template MapFirstPass(std::move(token), rows - 1); - } - } - } -} /** * Peek at the file to determine the number of rows and columns in the matrix, * assuming a transposed matrix. This will also take a first pass over the @@ -314,85 +173,68 @@ class LoadCSV * @param info DatasetMapper object to use for first pass. */ template - void GetTransposeMatrixSize(size_t& rows, size_t& cols, - DatasetMapper& info) - { - // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). This might be useful - // if, e.g., the MapPolicy needs to find which dimensions are numeric or - // categorical. - - // Reset to the start of the file. - inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - cols = 0; - - std::string line; - while (std::getline(inFile, line)) - { - ++cols; - // Remove whitespaces from either side - trim(line); - - if (cols == 1) - { - // Extract the number of dimensions. 
- std::pair dimen = GetMatSize(inFile, false, delim); - rows = dimen.second; - - if (info.Dimensionality() == 0) - { - info.SetDimensionality(rows); - } - else if (info.Dimensionality() != rows) - { - std::ostringstream oss; - oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; - throw std::invalid_argument(oss.str()); - } - } + void InitializeTransposeMapper(size_t& rows, size_t& cols, + DatasetMapper& info); - // If we need to do a first pass for the DatasetMapper, do it. - if (MapPolicy::NeedsFirstPass) - { - // In this case we must pass everything we parse to the MapPolicy. - size_t dim = 0; + inline void CategoricalMatSize(std::stringstream& lineStream, size_t& col, + const char delim); - std::stringstream lineStream; - std::string token; + // Functions common to both numeric & categorical parser - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - info.template MapFirstPass(std::move(token), dim++); - } - } + /** + * Get the size of the matrix. Based on isNumeric the fucntion can be used + * for both numeric_parse and categorical_parse. 
+ * + * @param f fstream stream to open the data file + * @param isNumeric bool to ecide if data is numeric or categorical + * @param delim char delimiter charecter + */ + inline std::pair GetMatrixSize(std::fstream& f, const bool isNumeric = true, const char delim = ',') + { + bool load_okay = f.good(); + + f.clear(); + + const std::fstream::pos_type pos1 = f.tellg(); + + size_t f_n_rows = 0; + size_t f_n_cols = 0; + + std::string lineString; + std::stringstream lineStream; + std::string token; + + while (f.good() && load_okay) + { + std::getline(f, lineString); + if (lineString.size() == 0) + break; + + lineStream.clear(); + lineStream.str(lineString); + + size_t line_n_cols = 0; + + if (isNumeric) + NumericMatSize(lineStream, line_n_cols, delim); + else + CategoricalMatSize(lineStream, line_n_cols, delim); + + if (f_n_cols < line_n_cols) + f_n_cols = line_n_cols; + + ++f_n_rows; } + + f.clear(); + f.seekg(pos1); + + std::pair mat_size(f_n_rows, f_n_cols); + + return mat_size; } + private: /** @@ -411,6 +253,8 @@ class LoadCSV inFile.unsetf(std::ios::skipws); } + // Fucntions for Categorical Parse + /** * Parse a non-transposed matrix. * @@ -419,72 +263,7 @@ class LoadCSV */ template void NonTransposeParse(arma::Mat& inout, - DatasetMapper& infoSet) - { - // Get the size of the matrix. - size_t rows, cols; - GetMatrixSize(rows, cols, infoSet); - - // Set up output matrix. - inout.set_size(rows, cols); - size_t row = 0; - size_t col = 0; - - // Reset file position. 
- std::string line; - inFile.clear(); - inFile.seekg(0, std::ios::beg); - - while (std::getline(inFile, line)) - { - // Remove whitespaces from either side - trim(line); - - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - if (token == "\t") - token.clear(); - - std::getline(lineStream, token, delim); - // Remove whitespace from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - inout(row, col++) = infoSet.template MapString(std::move(token), row); - } - - // Make sure we got the right number of rows. - if (col != cols) - { - std::ostringstream oss; - oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" - << col << ") on line " << row << "; should be " << cols - << " dimensions."; - throw std::runtime_error(oss.str()); - } - - ++row; col = 0; - } - } + DatasetMapper& infoSet); /** * Parse a transposed matrix. @@ -493,74 +272,7 @@ class LoadCSV * @param infoSet DatasetMapper to load with. */ template - void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) - { - // Get matrix size. This also initializes infoSet correctly. - size_t rows, cols; - GetTransposeMatrixSize(rows, cols, infoSet); - - // Set the matrix size. - inout.set_size(rows, cols); - - // Initialize auxiliary variables. - size_t row = 0; - size_t col = 0; - std::string line; - inFile.clear(); - inFile.seekg(0, std::ios::beg); - - while (std::getline(inFile, line)) - { - // Remove whitespaces from either side - trim(line); - // Reset the row we are looking at. (Remember this is transposed.) 
- row = 0; - std::stringstream lineStream; - std::string token; - - lineStream.clear(); - lineStream.str(line); - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - // Remove whitespaces from either side - trim(token); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - // first part of the string - std::string tok = token; - - while (token[token.size() - 1] != '"') - { - tok += delim; - std::getline(lineStream, token, delim); - tok += token; - } - - token = tok; - } - - inout(row, col) = infoSet.template MapString(std::move(token), row); - row++; - } - - // Make sure we got the right number of rows. - if (row != rows) - { - std::ostringstream oss; - oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row - << ") on line " << col << "; should be " << rows << " dimensions."; - throw std::runtime_error(oss.str()); - } - - // Increment the column index. - ++col; - } - } - - inline std::pair GetMatSize(std::fstream& f, const bool isNumeric, const char delim); + void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet); //! Extension (type) of file. 
std::string extension; @@ -575,6 +287,7 @@ class LoadCSV } // namespace data } // namespace mlpack -#include "load_csv_impl.hpp" +#include "load_numeric_csv.hpp" +#include "load_categorical_csv.hpp" #endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index d963f92a021..0dd614ecb56 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -166,7 +166,7 @@ bool Load(const std::string& filename, if (loadType != FileType::HDF5Binary) { if (loadType == FileType::CSVASCII) - success = loader.LoadCSVFile(matrix, stream); + success = loader.LoadNumericCSV(matrix, stream); else success = matrix.load(stream, ToArmaFileType(loadType)); } @@ -236,7 +236,7 @@ bool Load(const std::string& filename, try { LoadCSV loader(filename); - loader.Load(matrix, info, transpose); + loader.LoadCategoricalCSV(matrix, info, transpose); } catch (std::exception& e) { diff --git a/src/mlpack/core/data/load_csv_impl.hpp b/src/mlpack/core/data/load_numeric_csv.hpp similarity index 50% rename from src/mlpack/core/data/load_csv_impl.hpp rename to src/mlpack/core/data/load_numeric_csv.hpp index 4854ca5b3d4..eed62e8e8df 100644 --- a/src/mlpack/core/data/load_csv_impl.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -1,47 +1,21 @@ -/** - * @file core/data/load_csv_impl.hpp - * - * @author Conrad Sanderson - * @author Gopi M. Tatiraju - * - * This csv parser is designed by taking reference from armadillo's csv parser. - * In this mlpack's version, all the arma dependencies were removed or replaced - * accordingly, making the parser totally independent of armadillo. - * - * This parser will be totally independent to any linear algebra library. - * This can be used to load data into any matrix, i.e. arma and bandicoot - * in future. 
+/** + * @file core/data/load_numeric_csv.hpp + * @author Gopi Tatiraju * - * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp - * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) - * Copyright 2008-2016 National ICT Australia (NICTA) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ------------------------------------------------------------------------ + * Load a matrix from file. Matrix should contain only numeric data. * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the * 3-clause BSD license along with mlpack. If not, see * http://www.opensource.org/licenses/BSD-3-Clause for more information. 
*/ -#ifndef MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP -#define MLPACK_CORE_DATA_CSV_PARSER_IMPL_HPP +#ifndef MLPACK_CORE_DATA_LOAD_NUMERIC_CSV_HPP +#define MLPACK_CORE_DATA_LOAD_NUMERIC_CSV_HPP #include "load_csv.hpp" -namespace mlpack -{ -namespace data -{ +namespace mlpack{ +namespace data{ /** * Given the address of a matrix element(val) @@ -118,92 +92,14 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, return true; } -inline void LoadCSV::NumericParse(std::stringstream& lineStream, size_t& col, const char delim) -{ - std::string token; - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - ++col; - } -} - -inline void LoadCSV::CategoricalParse(std::stringstream& lineStream, size_t& col, const char delim) -{ - std::string token; - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - while (token[token.size() - 1] != '"') - std::getline(lineStream, token, delim); - } - - ++col; - } -} - -inline std::pair LoadCSV::GetMatSize(std::fstream& f, const bool isNumeric = true, const char delim = ',') -{ - bool load_okay = f.good(); - - f.clear(); - - const std::fstream::pos_type pos1 = f.tellg(); - - size_t f_n_rows = 0; - size_t f_n_cols = 0; - - std::string lineString; - std::stringstream lineStream; - std::string token; - - while (f.good() && load_okay) - { - std::getline(f, lineString); - if (lineString.size() == 0) - break; - - lineStream.clear(); - lineStream.str(lineString); - - size_t line_n_cols = 0; - - if (isNumeric) - NumericParse(lineStream, line_n_cols, delim); - else - CategoricalParse(lineStream, line_n_cols, delim); - - if (f_n_cols < line_n_cols) - f_n_cols = line_n_cols; - - ++f_n_rows; - } - - f.clear(); - f.seekg(pos1); - - std::pair mat_size(f_n_rows, f_n_cols); - - return mat_size; -} - -/** -* Returns a bool value showing whether data was loaded successfully or not. -* Parses the file and loads the data into the given matrix. 
-*/ template -bool LoadCSV::LoadCSVFile(MatType& x, std::fstream& f) +bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) { bool load_okay = f.good(); f.clear(); - std::pair mat_size = GetMatSize(f); + std::pair mat_size = GetMatrixSize(f); x.zeros(mat_size.first, mat_size.second); @@ -242,6 +138,17 @@ bool LoadCSV::LoadCSVFile(MatType& x, std::fstream& f) return load_okay; } +inline void LoadCSV::NumericMatSize(std::stringstream& lineStream, size_t& col, const char delim) +{ + std::string token; + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + ++col; + } +} + } // namespace data } // namespace mlpack diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 12ccdbc78bf..f6a4c0425b8 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -3,181 +3,181 @@ include(CTest) # mlpack test executable. add_executable(mlpack_test EXCLUDE_FROM_ALL - activation_functions_test.cpp - adaboost_test.cpp - akfn_test.cpp - aknn_test.cpp - ann_dist_test.cpp - ann_layer_test.cpp - ann_regularizer_test.cpp - ann_test_tools.hpp - ann_visitor_test.cpp - armadillo_svd_test.cpp - arma_extend_test.cpp - async_learning_test.cpp - augmented_rnns_tasks_test.cpp - bayesian_linear_regression_test.cpp - bias_svd_test.cpp - binarize_test.cpp - block_krylov_svd_test.cpp - callback_test.cpp - cf_test.cpp - cli_binding_test.cpp - convolutional_network_test.cpp - convolution_test.cpp - cosine_tree_test.cpp - cv_test.cpp - dbscan_test.cpp - dcgan_test.cpp - decision_tree_regressor_test.cpp - decision_tree_test.cpp - det_test.cpp - distribution_test.cpp - drusilla_select_test.cpp - emst_test.cpp - facilities_test.cpp - fastmks_test.cpp - feedforward_network_test.cpp - feedforward_network_2_test.cpp - gan_test.cpp - gmm_test.cpp - hmm_test.cpp - hpt_test.cpp - hoeffding_tree_test.cpp - hyperplane_test.cpp - image_load_test.cpp - imputation_test.cpp - init_rules_test.cpp - io_test.cpp - kde_test.cpp - 
kernel_pca_test.cpp - kernel_test.cpp - kernel_traits_test.cpp - kfn_test.cpp - kmeans_test.cpp - knn_test.cpp - krann_search_test.cpp - ksinit_test.cpp - lars_test.cpp - layer_names_test.cpp - lin_alg_test.cpp - linear_regression_test.cpp - lmnn_test.cpp - linear_svm_test.cpp + #activation_functions_test.cpp + #adaboost_test.cpp + #akfn_test.cpp + #aknn_test.cpp + #ann_dist_test.cpp + #ann_layer_test.cpp + #ann_regularizer_test.cpp + #ann_test_tools.hpp + #ann_visitor_test.cpp + #armadillo_svd_test.cpp + #arma_extend_test.cpp + #async_learning_test.cpp + #augmented_rnns_tasks_test.cpp + #bayesian_linear_regression_test.cpp + #bias_svd_test.cpp + #binarize_test.cpp + #block_krylov_svd_test.cpp + #callback_test.cpp + #cf_test.cpp + #cli_binding_test.cpp + #convolutional_network_test.cpp + #convolution_test.cpp + #cosine_tree_test.cpp + #cv_test.cpp + #dbscan_test.cpp + #dcgan_test.cpp + #decision_tree_regressor_test.cpp + #decision_tree_test.cpp + #det_test.cpp + #distribution_test.cpp + #drusilla_select_test.cpp + #emst_test.cpp + #facilities_test.cpp + #fastmks_test.cpp + #feedforward_network_test.cpp + #feedforward_network_2_test.cpp + #gan_test.cpp + #gmm_test.cpp + #hmm_test.cpp + #hpt_test.cpp + #hoeffding_tree_test.cpp + #hyperplane_test.cpp + #image_load_test.cpp + #imputation_test.cpp + #init_rules_test.cpp + #io_test.cpp + #kde_test.cpp + #kernel_pca_test.cpp + #kernel_test.cpp + #kernel_traits_test.cpp + #kfn_test.cpp + #kmeans_test.cpp + #knn_test.cpp + #krann_search_test.cpp + #ksinit_test.cpp + #lars_test.cpp + #layer_names_test.cpp + #lin_alg_test.cpp + #linear_regression_test.cpp + #lmnn_test.cpp + #linear_svm_test.cpp load_save_test.cpp - local_coordinate_coding_test.cpp - logistic_regression_test.cpp - log_test.cpp - loss_functions_test.cpp - lsh_test.cpp + #local_coordinate_coding_test.cpp + #logistic_regression_test.cpp + #log_test.cpp + #loss_functions_test.cpp + #lsh_test.cpp main.cpp - math_test.cpp - matrix_completion_test.cpp - 
maximal_inputs_test.cpp - metric_test.cpp - mean_shift_test.cpp - mock_categorical_data.hpp - nbc_test.cpp - nca_test.cpp - nmf_test.cpp - nystroem_method_test.cpp - octree_test.cpp - one_hot_encoding_test.cpp - pca_test.cpp - perceptron_test.cpp - prefixedoutstream_test.cpp - python_binding_test.cpp - qdafn_test.cpp - quic_svd_test.cpp - q_learning_test.cpp - radical_test.cpp - random_forest_test.cpp - random_test.cpp - randomized_svd_test.cpp - range_search_test.cpp - rbm_network_test.cpp - rectangle_tree_test.cpp - recurrent_network_test.cpp - rnn_reber_test.cpp - regularized_svd_test.cpp - reward_clipping_test.cpp - rl_components_test.cpp - scaling_test.cpp - size_checks_test.cpp - serialization.cpp - serialization.hpp - serialization_test.cpp - sfinae_test.cpp - softmax_regression_test.cpp - sort_policy_test.cpp - sparse_autoencoder_test.cpp - sparse_coding_test.cpp - spill_tree_test.cpp - split_data_test.cpp - string_encoding_test.cpp - sumtree_test.cpp - svd_batch_test.cpp - svd_incremental_test.cpp - svdplusplus_test.cpp - termination_policy_test.cpp - test_catch_tools.hpp - test_function_tools.hpp - timer_test.cpp - tree_test.cpp - tree_traits_test.cpp - ub_tree_test.cpp - union_find_test.cpp - vantage_point_tree_test.cpp - wgan_test.cpp - xgboost_test.cpp - main_tests/adaboost_test.cpp - main_tests/approx_kfn_test.cpp - main_tests/bayesian_linear_regression_test.cpp - main_tests/cf_test.cpp - main_tests/dbscan_test.cpp - main_tests/decision_tree_test.cpp - main_tests/det_test.cpp - main_tests/emst_test.cpp - main_tests/fastmks_test.cpp - main_tests/gmm_generate_test.cpp - main_tests/gmm_probability_test.cpp - main_tests/gmm_train_test.cpp - main_tests/hmm_generate_test.cpp - main_tests/hmm_loglik_test.cpp - main_tests/hmm_test_utils.hpp - main_tests/hmm_train_test.cpp - main_tests/hmm_viterbi_test.cpp - main_tests/hoeffding_tree_test.cpp - main_tests/image_converter_test.cpp - main_tests/kde_test.cpp - main_tests/kernel_pca_test.cpp - 
main_tests/kfn_test.cpp - main_tests/kmeans_test.cpp - main_tests/knn_test.cpp - main_tests/krann_test.cpp - main_tests/linear_regression_test.cpp - main_tests/lmnn_test.cpp - main_tests/linear_svm_test.cpp - main_tests/local_coordinate_coding_test.cpp - main_tests/logistic_regression_test.cpp - main_tests/lsh_test.cpp - main_tests/mean_shift_test.cpp - main_tests/nbc_test.cpp - main_tests/nca_test.cpp - main_tests/nmf_test.cpp - main_tests/pca_test.cpp - main_tests/perceptron_test.cpp - main_tests/preprocess_binarize_test.cpp - main_tests/preprocess_imputer_test.cpp - main_tests/preprocess_one_hot_encode_test.cpp - main_tests/preprocess_scale_test.cpp - main_tests/preprocess_split_test.cpp - main_tests/radical_test.cpp - main_tests/random_forest_test.cpp - main_tests/softmax_regression_test.cpp - main_tests/sparse_coding_test.cpp - main_tests/range_search_test.cpp - main_tests/test_helper.hpp + #math_test.cpp + #matrix_completion_test.cpp + #maximal_inputs_test.cpp + #metric_test.cpp + #mean_shift_test.cpp + #mock_categorical_data.hpp + #nbc_test.cpp + #nca_test.cpp + #nmf_test.cpp + #nystroem_method_test.cpp + #octree_test.cpp + #one_hot_encoding_test.cpp + #pca_test.cpp + #perceptron_test.cpp + #prefixedoutstream_test.cpp + #python_binding_test.cpp + #qdafn_test.cpp + #quic_svd_test.cpp + #q_learning_test.cpp + #radical_test.cpp + #random_forest_test.cpp + #random_test.cpp + #randomized_svd_test.cpp + #range_search_test.cpp + #rbm_network_test.cpp + #rectangle_tree_test.cpp + #recurrent_network_test.cpp + #rnn_reber_test.cpp + #regularized_svd_test.cpp + #reward_clipping_test.cpp + #rl_components_test.cpp + #scaling_test.cpp + #size_checks_test.cpp + #serialization.cpp + #serialization.hpp + #serialization_test.cpp + #sfinae_test.cpp + #softmax_regression_test.cpp + #sort_policy_test.cpp + #sparse_autoencoder_test.cpp + #sparse_coding_test.cpp + #spill_tree_test.cpp + #split_data_test.cpp + #string_encoding_test.cpp + #sumtree_test.cpp + #svd_batch_test.cpp + 
#svd_incremental_test.cpp + #svdplusplus_test.cpp + #termination_policy_test.cpp + #test_catch_tools.hpp + #test_function_tools.hpp + #timer_test.cpp + #tree_test.cpp + #tree_traits_test.cpp + #ub_tree_test.cpp + #union_find_test.cpp + #vantage_point_tree_test.cpp + #wgan_test.cpp + #xgboost_test.cpp + #main_tests/adaboost_test.cpp + #main_tests/approx_kfn_test.cpp + #main_tests/bayesian_linear_regression_test.cpp + #main_tests/cf_test.cpp + #main_tests/dbscan_test.cpp + #main_tests/decision_tree_test.cpp + #main_tests/det_test.cpp + #main_tests/emst_test.cpp + #main_tests/fastmks_test.cpp + #main_tests/gmm_generate_test.cpp + #main_tests/gmm_probability_test.cpp + #main_tests/gmm_train_test.cpp + #main_tests/hmm_generate_test.cpp + #main_tests/hmm_loglik_test.cpp + #main_tests/hmm_test_utils.hpp + #main_tests/hmm_train_test.cpp + #main_tests/hmm_viterbi_test.cpp + #main_tests/hoeffding_tree_test.cpp + #main_tests/image_converter_test.cpp + #main_tests/kde_test.cpp + #main_tests/kernel_pca_test.cpp + #main_tests/kfn_test.cpp + #main_tests/kmeans_test.cpp + #main_tests/knn_test.cpp + #main_tests/krann_test.cpp + #main_tests/linear_regression_test.cpp + #main_tests/lmnn_test.cpp + #main_tests/linear_svm_test.cpp + #main_tests/local_coordinate_coding_test.cpp + #main_tests/logistic_regression_test.cpp + #main_tests/lsh_test.cpp + #main_tests/mean_shift_test.cpp + #main_tests/nbc_test.cpp + #main_tests/nca_test.cpp + #main_tests/nmf_test.cpp + #main_tests/pca_test.cpp + #main_tests/perceptron_test.cpp + #main_tests/preprocess_binarize_test.cpp + #main_tests/preprocess_imputer_test.cpp + #main_tests/preprocess_one_hot_encode_test.cpp + #main_tests/preprocess_scale_test.cpp + #main_tests/preprocess_split_test.cpp + #main_tests/radical_test.cpp + #main_tests/random_forest_test.cpp + #main_tests/softmax_regression_test.cpp + #main_tests/sparse_coding_test.cpp + #main_tests/range_search_test.cpp + #main_tests/test_helper.hpp ) if(NOT BUILD_SHARED_LIBS) From 
b44a7eeaf519640691ce9ec21019d740a2473851 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 14 Aug 2021 16:19:38 +0530 Subject: [PATCH 078/112] Adding comments --- src/mlpack/core/data/load_categorical_csv.hpp | 32 +++++++------ src/mlpack/core/data/load_csv.hpp | 45 ++++++++++++++----- src/mlpack/core/data/load_numeric_csv.hpp | 23 ++++++---- 3 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index ac011422608..949a30bc09c 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -31,20 +31,20 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, inFile.seekg(0, std::ios::beg); rows = 0; cols = 0; - + std::string line; while (std::getline(inFile, line)) { ++cols; // Remove whitespaces from either side trim(line); - + if (cols == 1) { // Extract the number of dimensions. std::pair dimen = GetMatrixSize(inFile, false, delim); rows = dimen.second; - + if (info.Dimensionality() == 0) { info.SetDimensionality(rows); @@ -58,39 +58,39 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, throw std::invalid_argument(oss.str()); } } - + // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; - + std::stringstream lineStream; std::string token; - + lineStream.clear(); lineStream.str(line); - + while (lineStream.good()) { std::getline(lineStream, token, delim); // Remove whitespace from either side trim(token); - + if (token[0] == '"' && token[token.size() - 1] != '"') { std::string tok = token; - + while (token[token.size() - 1] != '"') { tok += delim; std::getline(lineStream, token, delim); tok += token; } - + token = tok; } - + info.template MapFirstPass(std::move(token), dim++); } } @@ -175,7 +175,7 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& inout, DatasetMapper& inf std::getline(lineStream, token, delim); // Remove whitespaces from either side trim(token); - + if (token[0] == '"' && token[token.size() - 1] != '"') { // first part of the string @@ -301,7 +301,7 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, std::getline(lineStream, token, delim); tok += token; } - + token = tok; } @@ -328,15 +328,13 @@ void LoadCSV::LoadCategoricalCSV(MatType &inout, const bool transpose) { CheckOpen(); - + if (transpose) TransposeParse(inout, infoSet); else NonTransposeParse(inout, infoSet); } - - inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim) { std::string token; diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 11a2490efa5..5955ec614e6 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -128,7 +128,15 @@ class LoadCSV template bool ConvertToken(typename MatType::elem_type& val, const std::string& token); - + /** + * Calculate number of columns in each row + * and assign the value to the col. This function + * will work only for numeric data.
+ * + * @param lineStream a single row of data + * @param col number of columns in lineStream + * @param delim delimiter character + */ inline void NumericMatSize(std::stringstream& lineStream, size_t& col, const char delim); @@ -176,6 +184,15 @@ class LoadCSV void InitializeTransposeMapper(size_t& rows, size_t& cols, DatasetMapper& info); + /** + * Calculate number of columns in each row + * and assign the value to the col. This function + * will work for categorical data. + * + * @param lineStream a single row of data + * @param col number of columns in lineStream + * @param delim delimiter character + */ inline void CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim); @@ -189,7 +206,9 @@ class LoadCSV * @param isNumeric bool to ecide if data is numeric or categorical * @param delim char delimiter charecter */ - inline std::pair GetMatrixSize(std::fstream& f, const bool isNumeric = true, const char delim = ',') + inline std::pair GetMatrixSize(std::fstream& f, + const bool isNumeric = true, + const char delim = ',') { bool load_okay = f.good(); @@ -206,31 +225,37 @@ class LoadCSV while (f.good() && load_okay) { + // Get a row of data std::getline(f, lineString); if (lineString.size() == 0) break; - + lineStream.clear(); lineStream.str(lineString); - + size_t line_n_cols = 0; - + + // Get number of columns based on the type of data if (isNumeric) NumericMatSize(lineStream, line_n_cols, delim); else CategoricalMatSize(lineStream, line_n_cols, delim); - + + // If there are different number of columns in each + // row, then the highest number of cols will be + // considered as the size of the matrix.
Missing + // elements will be filled as 0 if (f_n_cols < line_n_cols) f_n_cols = line_n_cols; - + ++f_n_rows; } - + f.clear(); f.seekg(pos1); - + std::pair mat_size(f_n_rows, f_n_cols); - + return mat_size; } diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index eed62e8e8df..3cdf3682143 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -17,17 +17,12 @@ namespace mlpack{ namespace data{ -/** -* Given the address of a matrix element(val) -* sets it equal to the provided value(token) -* example calling: convert_token(x.at(row, col), token) -*/ -template bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) { const size_t N = size_t(token.length()); + // Fill empty data points with 0 if (N == 0) { val = typename MatType::elem_type(0); @@ -36,6 +31,9 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const char* str = token.c_str(); + // Checks for +/-INF and NAN + // Converts them to their equivalent representation + // from numeric_limits if ((N == 3) || (N == 4)) { const bool neg = (str[0] == '-'); @@ -51,8 +49,9 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F'))) { - val = neg ? -(std::numeric_limits::infinity()) : - std::numeric_limits::infinity(); + val = neg ? -(std::numeric_limits + ::infinity()) : std::numeric_limits::infinity(); return true; } else if (((sig_a == 'n') || (sig_a == 'N')) && @@ -66,6 +65,9 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, char* endptr = nullptr; + // Convert the token into correct type.
+ // If we have a MatType::elem_type as unsigned int, + // it will convert all negative numbers to 0 if (std::is_floating_point::value) { val = typename MatType::elem_type(std::strtod(str, &endptr)); @@ -111,6 +113,7 @@ bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) while (f.good()) { + // Parse the file line by line std::getline(f, lineString); if (lineString.size() == 0) @@ -123,7 +126,11 @@ bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) while (lineStream.good()) { + // Parse each line std::getline(lineStream, token, ','); + + // This will handle loading of both dense and sparse. + // Initialize tmp_val of type MatType::elem_type with value 0. typename MatType::elem_type tmp_val = typename MatType::elem_type(0); if (ConvertToken(tmp_val, token)) From 22a96469e6ae2931215e11138f9fe2d0c2b3913c Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 14 Aug 2021 16:32:56 +0530 Subject: [PATCH 079/112] solving a small error --- src/mlpack/core/data/load_numeric_csv.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 3cdf3682143..d830916ca55 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -17,6 +17,7 @@ namespace mlpack{ namespace data{ +template bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) { From 3be9474388a42f74be0f0119346080a539a05e6b Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 14 Aug 2021 16:36:40 +0530 Subject: [PATCH 080/112] Running all tests --- src/mlpack/tests/CMakeLists.txt | 346 ++++++++++++++++---------------- 1 file changed, 173 insertions(+), 173 deletions(-) diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index f6a4c0425b8..12ccdbc78bf 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -3,181 +3,181 @@ include(CTest) # mlpack test executable. 
add_executable(mlpack_test EXCLUDE_FROM_ALL - #activation_functions_test.cpp - #adaboost_test.cpp - #akfn_test.cpp - #aknn_test.cpp - #ann_dist_test.cpp - #ann_layer_test.cpp - #ann_regularizer_test.cpp - #ann_test_tools.hpp - #ann_visitor_test.cpp - #armadillo_svd_test.cpp - #arma_extend_test.cpp - #async_learning_test.cpp - #augmented_rnns_tasks_test.cpp - #bayesian_linear_regression_test.cpp - #bias_svd_test.cpp - #binarize_test.cpp - #block_krylov_svd_test.cpp - #callback_test.cpp - #cf_test.cpp - #cli_binding_test.cpp - #convolutional_network_test.cpp - #convolution_test.cpp - #cosine_tree_test.cpp - #cv_test.cpp - #dbscan_test.cpp - #dcgan_test.cpp - #decision_tree_regressor_test.cpp - #decision_tree_test.cpp - #det_test.cpp - #distribution_test.cpp - #drusilla_select_test.cpp - #emst_test.cpp - #facilities_test.cpp - #fastmks_test.cpp - #feedforward_network_test.cpp - #feedforward_network_2_test.cpp - #gan_test.cpp - #gmm_test.cpp - #hmm_test.cpp - #hpt_test.cpp - #hoeffding_tree_test.cpp - #hyperplane_test.cpp - #image_load_test.cpp - #imputation_test.cpp - #init_rules_test.cpp - #io_test.cpp - #kde_test.cpp - #kernel_pca_test.cpp - #kernel_test.cpp - #kernel_traits_test.cpp - #kfn_test.cpp - #kmeans_test.cpp - #knn_test.cpp - #krann_search_test.cpp - #ksinit_test.cpp - #lars_test.cpp - #layer_names_test.cpp - #lin_alg_test.cpp - #linear_regression_test.cpp - #lmnn_test.cpp - #linear_svm_test.cpp + activation_functions_test.cpp + adaboost_test.cpp + akfn_test.cpp + aknn_test.cpp + ann_dist_test.cpp + ann_layer_test.cpp + ann_regularizer_test.cpp + ann_test_tools.hpp + ann_visitor_test.cpp + armadillo_svd_test.cpp + arma_extend_test.cpp + async_learning_test.cpp + augmented_rnns_tasks_test.cpp + bayesian_linear_regression_test.cpp + bias_svd_test.cpp + binarize_test.cpp + block_krylov_svd_test.cpp + callback_test.cpp + cf_test.cpp + cli_binding_test.cpp + convolutional_network_test.cpp + convolution_test.cpp + cosine_tree_test.cpp + cv_test.cpp + 
dbscan_test.cpp + dcgan_test.cpp + decision_tree_regressor_test.cpp + decision_tree_test.cpp + det_test.cpp + distribution_test.cpp + drusilla_select_test.cpp + emst_test.cpp + facilities_test.cpp + fastmks_test.cpp + feedforward_network_test.cpp + feedforward_network_2_test.cpp + gan_test.cpp + gmm_test.cpp + hmm_test.cpp + hpt_test.cpp + hoeffding_tree_test.cpp + hyperplane_test.cpp + image_load_test.cpp + imputation_test.cpp + init_rules_test.cpp + io_test.cpp + kde_test.cpp + kernel_pca_test.cpp + kernel_test.cpp + kernel_traits_test.cpp + kfn_test.cpp + kmeans_test.cpp + knn_test.cpp + krann_search_test.cpp + ksinit_test.cpp + lars_test.cpp + layer_names_test.cpp + lin_alg_test.cpp + linear_regression_test.cpp + lmnn_test.cpp + linear_svm_test.cpp load_save_test.cpp - #local_coordinate_coding_test.cpp - #logistic_regression_test.cpp - #log_test.cpp - #loss_functions_test.cpp - #lsh_test.cpp + local_coordinate_coding_test.cpp + logistic_regression_test.cpp + log_test.cpp + loss_functions_test.cpp + lsh_test.cpp main.cpp - #math_test.cpp - #matrix_completion_test.cpp - #maximal_inputs_test.cpp - #metric_test.cpp - #mean_shift_test.cpp - #mock_categorical_data.hpp - #nbc_test.cpp - #nca_test.cpp - #nmf_test.cpp - #nystroem_method_test.cpp - #octree_test.cpp - #one_hot_encoding_test.cpp - #pca_test.cpp - #perceptron_test.cpp - #prefixedoutstream_test.cpp - #python_binding_test.cpp - #qdafn_test.cpp - #quic_svd_test.cpp - #q_learning_test.cpp - #radical_test.cpp - #random_forest_test.cpp - #random_test.cpp - #randomized_svd_test.cpp - #range_search_test.cpp - #rbm_network_test.cpp - #rectangle_tree_test.cpp - #recurrent_network_test.cpp - #rnn_reber_test.cpp - #regularized_svd_test.cpp - #reward_clipping_test.cpp - #rl_components_test.cpp - #scaling_test.cpp - #size_checks_test.cpp - #serialization.cpp - #serialization.hpp - #serialization_test.cpp - #sfinae_test.cpp - #softmax_regression_test.cpp - #sort_policy_test.cpp - #sparse_autoencoder_test.cpp - 
#sparse_coding_test.cpp - #spill_tree_test.cpp - #split_data_test.cpp - #string_encoding_test.cpp - #sumtree_test.cpp - #svd_batch_test.cpp - #svd_incremental_test.cpp - #svdplusplus_test.cpp - #termination_policy_test.cpp - #test_catch_tools.hpp - #test_function_tools.hpp - #timer_test.cpp - #tree_test.cpp - #tree_traits_test.cpp - #ub_tree_test.cpp - #union_find_test.cpp - #vantage_point_tree_test.cpp - #wgan_test.cpp - #xgboost_test.cpp - #main_tests/adaboost_test.cpp - #main_tests/approx_kfn_test.cpp - #main_tests/bayesian_linear_regression_test.cpp - #main_tests/cf_test.cpp - #main_tests/dbscan_test.cpp - #main_tests/decision_tree_test.cpp - #main_tests/det_test.cpp - #main_tests/emst_test.cpp - #main_tests/fastmks_test.cpp - #main_tests/gmm_generate_test.cpp - #main_tests/gmm_probability_test.cpp - #main_tests/gmm_train_test.cpp - #main_tests/hmm_generate_test.cpp - #main_tests/hmm_loglik_test.cpp - #main_tests/hmm_test_utils.hpp - #main_tests/hmm_train_test.cpp - #main_tests/hmm_viterbi_test.cpp - #main_tests/hoeffding_tree_test.cpp - #main_tests/image_converter_test.cpp - #main_tests/kde_test.cpp - #main_tests/kernel_pca_test.cpp - #main_tests/kfn_test.cpp - #main_tests/kmeans_test.cpp - #main_tests/knn_test.cpp - #main_tests/krann_test.cpp - #main_tests/linear_regression_test.cpp - #main_tests/lmnn_test.cpp - #main_tests/linear_svm_test.cpp - #main_tests/local_coordinate_coding_test.cpp - #main_tests/logistic_regression_test.cpp - #main_tests/lsh_test.cpp - #main_tests/mean_shift_test.cpp - #main_tests/nbc_test.cpp - #main_tests/nca_test.cpp - #main_tests/nmf_test.cpp - #main_tests/pca_test.cpp - #main_tests/perceptron_test.cpp - #main_tests/preprocess_binarize_test.cpp - #main_tests/preprocess_imputer_test.cpp - #main_tests/preprocess_one_hot_encode_test.cpp - #main_tests/preprocess_scale_test.cpp - #main_tests/preprocess_split_test.cpp - #main_tests/radical_test.cpp - #main_tests/random_forest_test.cpp - #main_tests/softmax_regression_test.cpp - 
#main_tests/sparse_coding_test.cpp - #main_tests/range_search_test.cpp - #main_tests/test_helper.hpp + math_test.cpp + matrix_completion_test.cpp + maximal_inputs_test.cpp + metric_test.cpp + mean_shift_test.cpp + mock_categorical_data.hpp + nbc_test.cpp + nca_test.cpp + nmf_test.cpp + nystroem_method_test.cpp + octree_test.cpp + one_hot_encoding_test.cpp + pca_test.cpp + perceptron_test.cpp + prefixedoutstream_test.cpp + python_binding_test.cpp + qdafn_test.cpp + quic_svd_test.cpp + q_learning_test.cpp + radical_test.cpp + random_forest_test.cpp + random_test.cpp + randomized_svd_test.cpp + range_search_test.cpp + rbm_network_test.cpp + rectangle_tree_test.cpp + recurrent_network_test.cpp + rnn_reber_test.cpp + regularized_svd_test.cpp + reward_clipping_test.cpp + rl_components_test.cpp + scaling_test.cpp + size_checks_test.cpp + serialization.cpp + serialization.hpp + serialization_test.cpp + sfinae_test.cpp + softmax_regression_test.cpp + sort_policy_test.cpp + sparse_autoencoder_test.cpp + sparse_coding_test.cpp + spill_tree_test.cpp + split_data_test.cpp + string_encoding_test.cpp + sumtree_test.cpp + svd_batch_test.cpp + svd_incremental_test.cpp + svdplusplus_test.cpp + termination_policy_test.cpp + test_catch_tools.hpp + test_function_tools.hpp + timer_test.cpp + tree_test.cpp + tree_traits_test.cpp + ub_tree_test.cpp + union_find_test.cpp + vantage_point_tree_test.cpp + wgan_test.cpp + xgboost_test.cpp + main_tests/adaboost_test.cpp + main_tests/approx_kfn_test.cpp + main_tests/bayesian_linear_regression_test.cpp + main_tests/cf_test.cpp + main_tests/dbscan_test.cpp + main_tests/decision_tree_test.cpp + main_tests/det_test.cpp + main_tests/emst_test.cpp + main_tests/fastmks_test.cpp + main_tests/gmm_generate_test.cpp + main_tests/gmm_probability_test.cpp + main_tests/gmm_train_test.cpp + main_tests/hmm_generate_test.cpp + main_tests/hmm_loglik_test.cpp + main_tests/hmm_test_utils.hpp + main_tests/hmm_train_test.cpp + main_tests/hmm_viterbi_test.cpp + 
main_tests/hoeffding_tree_test.cpp + main_tests/image_converter_test.cpp + main_tests/kde_test.cpp + main_tests/kernel_pca_test.cpp + main_tests/kfn_test.cpp + main_tests/kmeans_test.cpp + main_tests/knn_test.cpp + main_tests/krann_test.cpp + main_tests/linear_regression_test.cpp + main_tests/lmnn_test.cpp + main_tests/linear_svm_test.cpp + main_tests/local_coordinate_coding_test.cpp + main_tests/logistic_regression_test.cpp + main_tests/lsh_test.cpp + main_tests/mean_shift_test.cpp + main_tests/nbc_test.cpp + main_tests/nca_test.cpp + main_tests/nmf_test.cpp + main_tests/pca_test.cpp + main_tests/perceptron_test.cpp + main_tests/preprocess_binarize_test.cpp + main_tests/preprocess_imputer_test.cpp + main_tests/preprocess_one_hot_encode_test.cpp + main_tests/preprocess_scale_test.cpp + main_tests/preprocess_split_test.cpp + main_tests/radical_test.cpp + main_tests/random_forest_test.cpp + main_tests/softmax_regression_test.cpp + main_tests/sparse_coding_test.cpp + main_tests/range_search_test.cpp + main_tests/test_helper.hpp ) if(NOT BUILD_SHARED_LIBS) From 7569626a8de23492ef1b373f718ccd8c0ce24096 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 16 Aug 2021 12:56:20 +0530 Subject: [PATCH 081/112] Indentation | Converted GetMatrixSize() to template fuction --- src/mlpack/core/data/load_categorical_csv.hpp | 83 ++++++++----------- src/mlpack/core/data/load_csv.hpp | 23 +++-- src/mlpack/core/data/load_impl.hpp | 10 +-- src/mlpack/core/data/load_numeric_csv.hpp | 9 +- 4 files changed, 53 insertions(+), 72 deletions(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 949a30bc09c..cf32f3dac28 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -17,9 +17,39 @@ namespace mlpack{ namespace data{ +template +void LoadCSV::LoadCategoricalCSV(MatType &inout, + DatasetMapper &infoSet, + const bool transpose) +{ + CheckOpen(); + + if (transpose) + 
TransposeParse(inout, infoSet); + else + NonTransposeParse(inout, infoSet); +} + +inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim) +{ + std::string token; + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + while (token[token.size() - 1] != '"') + std::getline(lineStream, token, delim); + } + + ++col; + } +} + template void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, - DatasetMapper& info) + DatasetMapper& info) { // Take a pass through the file. If the DatasetMapper policy requires it, // we will pass everything string through MapString(). This might be useful @@ -42,7 +72,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, if (cols == 1) { // Extract the number of dimensions. - std::pair dimen = GetMatrixSize(inFile, false, delim); + std::pair dimen = GetMatrixSize(inFile, delim); rows = dimen.second; if (info.Dimensionality() == 0) @@ -64,7 +94,6 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, { // In this case we must pass everything we parse to the MapPolicy. 
size_t dim = 0; - std::stringstream lineStream; std::string token; @@ -87,10 +116,8 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, std::getline(lineStream, token, delim); tok += token; } - token = tok; } - info.template MapFirstPass(std::move(token), dim++); } } @@ -143,7 +170,7 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper dimen = GetMatrixSize(inFile, false, delim); + std::pair dimen = GetMatrixSize(inFile, delim); cols = dimen.second; } @@ -152,7 +179,6 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper(std::move(token), rows - 1); } } @@ -210,7 +233,6 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf row = 0; std::stringstream lineStream; std::string token; - lineStream.clear(); lineStream.str(line); @@ -224,17 +246,14 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf { // first part of the string std::string tok = token; - while (token[token.size() - 1] != '"') { tok += delim; std::getline(lineStream, token, delim); tok += token; } - token = tok; } - inout(row, col) = infoSet.template MapString(std::move(token), row); row++; } @@ -247,7 +266,6 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf << ") on line " << col << "; should be " << rows << " dimensions."; throw std::runtime_error(oss.str()); } - // Increment the column index. 
++col; } @@ -294,17 +312,14 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, if (token[0] == '"' && token[token.size() - 1] != '"') { std::string tok = token; - while (token[token.size() - 1] != '"') { tok += delim; std::getline(lineStream, token, delim); tok += token; } - token = tok; } - inout(row, col++) = infoSet.template MapString(std::move(token), row); } @@ -317,42 +332,10 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, << " dimensions."; throw std::runtime_error(oss.str()); } - ++row; col = 0; } } -template -void LoadCSV::LoadCategoricalCSV(MatType &inout, - DatasetMapper &infoSet, - const bool transpose) -{ - CheckOpen(); - - if (transpose) - TransposeParse(inout, infoSet); - else - NonTransposeParse(inout, infoSet); -} - -inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim) -{ - std::string token; - - while (lineStream.good()) - { - std::getline(lineStream, token, delim); - - if (token[0] == '"' && token[token.size() - 1] != '"') - { - while (token[token.size() - 1] != '"') - std::getline(lineStream, token, delim); - } - - ++col; - } -} - } //namespace data } //namespace mlpack diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 5955ec614e6..3cabc265d98 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -71,7 +71,10 @@ class LoadCSV /** * Construct the LoadCSV object on the given file. This will construct the - * rules necessary for loading and attempt to open the file. + * rules necessary for loading and attempt to open the file. This will also + * initialize the delimiter character for parsing. + * + * @param file path of the dataset */ LoadCSV(const std::string& file) : extension(Extension(file)), @@ -203,26 +206,25 @@ class LoadCSV * for both numeric_parse and categorical_parse. 
* * @param f fstream stream to open the data file - * @param isNumeric bool to ecide if data is numeric or categorical * @param delim char delimiter charecter */ + template inline std::pair GetMatrixSize(std::fstream& f, - const bool isNumeric = true, const char delim = ',') { bool load_okay = f.good(); - + f.clear(); - + const std::fstream::pos_type pos1 = f.tellg(); - + size_t f_n_rows = 0; size_t f_n_cols = 0; - + std::string lineString; std::stringstream lineStream; std::string token; - + while (f.good() && load_okay) { // Get a row of data @@ -266,15 +268,18 @@ class LoadCSV * Check whether or not the file has successfully opened; throw an exception * if not. */ - void CheckOpen() + inline void CheckOpen() { + // check if file is opening if (!inFile.is_open()) { std::ostringstream oss; oss << "Cannot open file '" << filename << "'. " << std::endl; + // throw an exception if file is not opening throw std::runtime_error(oss.str()); } + // clear format flag inFile.unsetf(std::ios::skipws); } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 0dd614ecb56..4475e3e64ba 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -93,11 +93,11 @@ bool Load(const std::string& filename, // Catch nonexistent files by opening the stream ourselves. std::fstream stream; - #ifdef _WIN32 // Always open in binary mode on Windows. +#ifdef _WIN32 // Always open in binary mode on Windows. stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); - #else +#else stream.open(filename.c_str(), std::fstream::in); - #endif +#endif if (!stream.is_open()) { Timer::Stop("loading_data"); @@ -133,7 +133,7 @@ bool Load(const std::string& filename, stringType = GetStringType(loadType); - #ifndef ARMA_USE_HDF5 +#ifndef ARMA_USE_HDF5 if (inputLoadType == FileType::HDF5Binary) { // Ensure that HDF5 is supported. 
@@ -149,7 +149,7 @@ bool Load(const std::string& filename, return false; } - #endif +#endif // Try to load the file; but if it's raw_binary, it could be a problem. if (loadType == FileType::RawBinary) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index d830916ca55..77454c1c529 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -22,7 +22,6 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, const std::string& token) { const size_t N = size_t(token.length()); - // Fill empty data points with 0 if (N == 0) { @@ -84,7 +83,6 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, val = typename MatType::elem_type(0); return true; } - val = typename MatType::elem_type( std::strtoull(str, &endptr, 10)); } } @@ -99,13 +97,9 @@ template bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) { bool load_okay = f.good(); - f.clear(); - - std::pair mat_size = GetMatrixSize(f); - + std::pair mat_size = GetMatrixSize(f); x.zeros(mat_size.first, mat_size.second); - size_t row = 0; std::string lineString; @@ -140,7 +134,6 @@ bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) ++col; } } - ++row; } return load_okay; From 589689e966d8fcb83500d065243b99671bad8df3 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 21 Aug 2021 12:07:59 +0530 Subject: [PATCH 082/112] Changing template parameter MatType to eT in Load() --- src/mlpack/core/data/load.hpp | 49 ++++++- src/mlpack/core/data/load_categorical_csv.hpp | 4 +- src/mlpack/core/data/load_csv.hpp | 15 +- src/mlpack/core/data/load_impl.hpp | 132 +++++++++++++++++- src/mlpack/core/data/load_numeric_csv.hpp | 39 +++--- src/mlpack/core/data/save.hpp | 4 +- src/mlpack/core/data/save_impl.hpp | 6 +- 7 files changed, 202 insertions(+), 47 deletions(-) diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 4a664a7824c..bd0fd7a0664 100644 --- a/src/mlpack/core/data/load.hpp +++ 
b/src/mlpack/core/data/load.hpp @@ -69,12 +69,47 @@ namespace data /** Functions to load and save matrices and models. */ { * @param inputLoadType Used to determine the type of file to load (default arma::auto_detect). * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, - MatType& matrix, - const bool fatal = false, - const bool transpose = true, - const FileType inputLoadType = FileType::AutoDetect); + arma::Mat& matrix, + const bool fatal = false, + const bool transpose = true, + const FileType inputLoadType = FileType::AutoDetect); + +/** + * Loads a sparse matrix from file, using arma::coord_ascii format. This + * will transpose the matrix at load time (unless the transpose parameter is set + * to false). If the filetype cannot be determined, an error will be given. + * + * The supported types of files are the same as found in Armadillo: + * + * - TSV (coord_ascii), denoted by .tsv or .txt + * - TXT (coord_ascii), denoted by .txt + * - Raw binary (raw_binary), denoted by .bin + * - Armadillo binary (arma_binary), denoted by .bin + * + * If the file extension is not one of those types, an error will be given. + * This is preferable to Armadillo's default behavior of loading an unknown + * filetype as raw_binary, which can have very confusing effects. + * + * If the parameter 'fatal' is set to true, a std::runtime_error exception will + * be thrown if the matrix does not load successfully. The parameter + * 'transpose' controls whether or not the matrix is transposed after loading. + * In most cases, because data is generally stored in a row-major format and + * mlpack requires column-major matrices, this should be left at its default + * value of 'true'. + * + * @param filename Name of file to load. + * @param matrix Sparse matrix to load contents of file into. + * @param fatal If an error should be reported as fatal (default false). 
+ * @param transpose If true, transpose the matrix after loading (default true). + * @return Boolean value indicating success or failure of load. + */ +template +bool Load(const std::string& filename, + arma::SpMat& matrix, + const bool fatal = false, + const bool transpose = true); /** * Load a column vector from a file, guessing the filetype from the extension. @@ -176,9 +211,9 @@ bool Load(const std::string& filename, * @param transpose If true, transpose the matrix after loading. * @return Boolean value indicating success or failure of load. */ -template +template bool Load(const std::string& filename, - MatType& matrix, + arma::Mat& matrix, DatasetMapper& info, const bool fatal = false, const bool transpose = true); diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index cf32f3dac28..d512c948a76 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -17,8 +17,8 @@ namespace mlpack{ namespace data{ -template -void LoadCSV::LoadCategoricalCSV(MatType &inout, +template +void LoadCSV::LoadCategoricalCSV(arma::Mat &inout, DatasetMapper &infoSet, const bool transpose) { diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 3cabc265d98..c95a5181457 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -108,14 +108,11 @@ class LoadCSV * the second pass, the function converts each value to required datatype * and sets it equal to val. * - * This function uses MatType as template parameter in order to provide - * support for any type of matrices from any linear algebra library. 
- * * @param x Matrix in which data will be loaded * @param f File stream to access the data file */ - template - bool LoadNumericCSV(MatType& x, std::fstream& f); + template + bool LoadNumericCSV(arma::Mat& x, std::fstream& f); /** * Convert the given string token to assigned datatype and assign @@ -128,8 +125,8 @@ class LoadCSV * @param val Token's value will be assigned to this address * @param token Value which should be assigned */ - template - bool ConvertToken(typename MatType::elem_type& val, const std::string& token); + template + bool ConvertToken(eT& val, const std::string& token); /** * Caluculate number of columns in each row @@ -154,8 +151,8 @@ class LoadCSV * @param transpose If true, the matrix should be transposed on loading * (default). */ - template - void LoadCategoricalCSV(MatType &inout, + template + void LoadCategoricalCSV(arma::Mat &inout, DatasetMapper &infoSet, const bool transpose = true); diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 4475e3e64ba..7b9f8eac909 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -81,9 +81,9 @@ bool inline inplace_transpose(MatType& X, bool fatal) } } -template +template bool Load(const std::string& filename, - MatType& matrix, + arma::Mat& matrix, const bool fatal, const bool transpose, const FileType inputLoadType) @@ -201,9 +201,9 @@ bool Load(const std::string& filename, } // Load with mappings. Unfortunately we have to implement this ourselves. -template +template bool Load(const std::string& filename, - MatType& matrix, + arma::Mat& matrix, DatasetMapper& info, const bool fatal, const bool transpose) @@ -296,6 +296,130 @@ bool Load(const std::string& filename, return true; } +// For loading data into sparse matrix +template +bool Load(const std::string& filename, + arma::SpMat& matrix, + const bool fatal, + const bool transpose) +{ + Timer::Start("loading_data"); + + // Get the extension. 
+ std::string extension = Extension(filename); + + // Catch nonexistent files by opening the stream ourselves. + std::fstream stream; +#ifdef _WIN32 // Always open in binary mode on Windows. + stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); +#else + stream.open(filename.c_str(), std::fstream::in); +#endif + if (!stream.is_open()) + { + Timer::Stop("loading_data"); + if (fatal) + Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; + else + Log::Warn << "Cannot open file '" << filename << "'; load failed." + << std::endl; + + return false; + } + + bool unknownType = false; + arma::file_type loadType; + std::string stringType; + + if (extension == "tsv" || extension == "txt") + { + loadType = arma::coord_ascii; + stringType = "Coordinate Formatted Data for Sparse Matrix"; + } + else if (extension == "bin") + { + // This could be raw binary or Armadillo binary (binary with header). We + // will check to see if it is Armadillo binary. + const std::string ARMA_SPM_BIN = "ARMA_SPM_BIN"; + std::string rawHeader(ARMA_SPM_BIN.length(), '\0'); + + std::streampos pos = stream.tellg(); + + stream.read(&rawHeader[0], std::streamsize(ARMA_SPM_BIN.length())); + stream.clear(); + stream.seekg(pos); // Reset stream position after peeking. + + if (rawHeader == ARMA_SPM_BIN) + { + stringType = "Armadillo binary formatted data for sparse matrix"; + loadType = arma::arma_binary; + } + else // We can only assume it's raw binary. + { + stringType = "raw binary formatted data"; + loadType = arma::raw_binary; + } + } + else // Unknown extension... + { + unknownType = true; + loadType = arma::raw_binary; // Won't be used; prevent a warning. + stringType = ""; + } + + // Provide error if we don't know the type. + if (unknownType) + { + Timer::Stop("loading_data"); + if (fatal) + Log::Fatal << "Unable to detect type of '" << filename << "'; " + << "incorrect extension?" 
<< std::endl; + else + Log::Warn << "Unable to detect type of '" << filename << "'; load failed." + << " Incorrect extension?" << std::endl; + + return false; + } + + // Try to load the file; but if it's raw_binary, it could be a problem. + if (loadType == arma::raw_binary) + Log::Warn << "Loading '" << filename << "' as " << stringType << "; " + << "but this may not be the actual filetype!" << std::endl; + else + Log::Info << "Loading '" << filename << "' as " << stringType << ". " + << std::flush; + + bool success; + + success = matrix.load(stream, loadType); + + if (!success) + { + Log::Info << std::endl; + Timer::Stop("loading_data"); + if (fatal) + Log::Fatal << "Loading from '" << filename << "' failed." << std::endl; + else + Log::Warn << "Loading from '" << filename << "' failed." << std::endl; + + return false; + } + else + Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows) + << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n"; + + // Now transpose the matrix, if necessary. + if (transpose) + { + success = inplace_transpose(matrix, fatal); + } + + Timer::Stop("loading_data"); + + // Finally, return the success indicator. 
+ return success; +} + } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 77454c1c529..b5f7535bea8 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -17,15 +17,15 @@ namespace mlpack{ namespace data{ -template -bool LoadCSV::ConvertToken(typename MatType::elem_type& val, +template +bool LoadCSV::ConvertToken(eT& val, const std::string& token) { const size_t N = size_t(token.length()); // Fill empty data points with 0 if (N == 0) { - val = typename MatType::elem_type(0); + val = eT(0); return true; } @@ -49,16 +49,15 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, ((sig_b == 'n') || (sig_b == 'N')) && ((sig_c == 'f') || (sig_c == 'F'))) { - val = neg ? -(std::numeric_limits - ::infinity()) : std::numeric_limits::infinity(); + val = neg ? -(std::numeric_limits + ::infinity()) : std::numeric_limits::infinity(); return true; } else if (((sig_a == 'n') || (sig_a == 'N')) && ((sig_b == 'a') || (sig_b == 'A')) && ((sig_c == 'n') || (sig_c == 'N'))) { - val = std::numeric_limits::quiet_NaN(); + val = std::numeric_limits::quiet_NaN(); return true; } } @@ -66,24 +65,24 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, char* endptr = nullptr; // Convert the token into ccorrect type. 
- // If we have a MatType::elem_type as unsigned int, + // If we have a eT as unsigned int, // it will convert all negative numbers to 0 - if (std::is_floating_point::value) + if (std::is_floating_point::value) { - val = typename MatType::elem_type(std::strtod(str, &endptr)); + val = eT(std::strtod(str, &endptr)); } - else if (std::is_integral::value) + else if (std::is_integral::value) { - if (std::is_signed::value) - val = typename MatType::elem_type(std::strtoll(str, &endptr, 10)); + if (std::is_signed::value) + val = eT(std::strtoll(str, &endptr, 10)); else { if (str[0] == '-') { - val = typename MatType::elem_type(0); + val = eT(0); return true; } - val = typename MatType::elem_type( std::strtoull(str, &endptr, 10)); + val = eT(std::strtoull(str, &endptr, 10)); } } @@ -93,8 +92,8 @@ bool LoadCSV::ConvertToken(typename MatType::elem_type& val, return true; } -template -bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) +template +bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) { bool load_okay = f.good(); f.clear(); @@ -125,10 +124,10 @@ bool LoadCSV::LoadNumericCSV(MatType& x, std::fstream& f) std::getline(lineStream, token, ','); // This will handle loading of both dense and sparse. - // Initialize tmp_val of type MatType::elem_type with value 0. - typename MatType::elem_type tmp_val = typename MatType::elem_type(0); + // Initialize tmp_val of type eT with value 0. + eT tmp_val = eT(0); - if (ConvertToken(tmp_val, token)) + if (ConvertToken(tmp_val, token)) { x.at(row, col) = tmp_val; ++col; diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 3487e22726f..19e11ee1616 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -60,9 +60,9 @@ namespace data /** Functions to load and save matrices. */ { * @param inputSaveType File type to save to (defaults to arma::auto_detect). * @return Boolean value indicating success or failure of save. 
*/ -template +template bool Save(const std::string& filename, - const MatType& matrix, + const arma::Mat& matrix, const bool fatal = false, bool transpose = true, FileType inputSaveType = FileType::AutoDetect); diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index 49d95f90b45..18a353dbc69 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -42,9 +42,9 @@ bool Save(const std::string& filename, return Save(filename, rowvec, fatal, true, inputSaveType); } -template +template bool Save(const std::string& filename, - const MatType& matrix, + const arma::Mat& matrix, const bool fatal, bool transpose, FileType inputSaveType) @@ -100,7 +100,7 @@ bool Save(const std::string& filename, // Transpose the matrix. if (transpose) { - MatType tmp = trans(matrix); + arma::Mat tmp = trans(matrix); #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. From c78e62698f164914b3b7f8234ad1b44571cf80f3 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 22 Aug 2021 12:31:39 +0530 Subject: [PATCH 083/112] More indentation issues --- src/mlpack/core/data/load_categorical_csv.hpp | 2 -- src/mlpack/core/data/load_csv.hpp | 23 ++++++++----------- src/mlpack/core/data/load_numeric_csv.hpp | 1 - 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index d512c948a76..e592e7586ba 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -42,7 +42,6 @@ inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& c while (token[token.size() - 1] != '"') std::getline(lineStream, token, delim); } - ++col; } } @@ -109,7 +108,6 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, if (token[0] == '"' && token[token.size() - 1] != '"') { std::string tok = token; - while (token[token.size() - 1] != '"') { tok += delim; diff --git 
a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index c95a5181457..eec3c1b66be 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -12,7 +12,7 @@ * * As the implementation is inspired from Armadillo it * is necessary to add two different licenses. One for - * Armadillo and other for mlpack. + * Armadillo and another for mlpack. * * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp * @@ -56,7 +56,7 @@ namespace mlpack { namespace data { /** - * Load the csv file.This class contains fucntions + * Load the csv file. This class contains functions * to load numeric and categorical data. */ class LoadCSV @@ -71,7 +71,7 @@ class LoadCSV /** * Construct the LoadCSV object on the given file. This will construct the - * rules necessary for loading and attempt to open the file. This will also + * rules necessary for loading and will attempt to open the file. This will also * initialize the delimiter character for parsing. * * @param file path of the dataset @@ -97,12 +97,12 @@ class LoadCSV CheckOpen(); } - // Fucntions for Numeric Parser + // Functions for Numeric Parser /** * Returns a bool value showing whether data was loaded successfully or not. * - * Parses a csv file and loads the data into a given matrix. In the first pass, + * Parses a csv file and loads the data into the given matrix. In the first pass, * the function will determine the number of cols and rows in the given file. * Once the rows and cols are fixed we initialize the matrix with zeros. In * the second pass, the function converts each value to required datatype @@ -115,9 +115,9 @@ class LoadCSV bool LoadNumericCSV(arma::Mat& x, std::fstream& f); /** - * Convert the given string token to assigned datatype and assign + * Converts the given string token to assigned datatype and assigns * this value to the given address. The address here will be a - * matrix location. + * matrix location eg. 
matrix(row, col). * * Token is always read as a string, if the given token is +/-INF or NAN * it converts them to infinity and NAN using numeric_limits. @@ -182,11 +182,11 @@ class LoadCSV */ template void InitializeTransposeMapper(size_t& rows, size_t& cols, - DatasetMapper& info); + DatasetMapper& info); /** - * Caluculate number of columns in each row - * and assign the value to the col. This fucntion + * Calculate number of columns in each row + * and assign the value to the col. This function * will work for categorical data. * * @param lineStream a single row of data @@ -212,12 +212,10 @@ class LoadCSV bool load_okay = f.good(); f.clear(); - const std::fstream::pos_type pos1 = f.tellg(); size_t f_n_rows = 0; size_t f_n_cols = 0; - std::string lineString; std::stringstream lineStream; std::string token; @@ -231,7 +229,6 @@ class LoadCSV lineStream.clear(); lineStream.str(lineString); - size_t line_n_cols = 0; // Get number of columns based on the type of data diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index b5f7535bea8..774680e3ec3 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -141,7 +141,6 @@ bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) inline void LoadCSV::NumericMatSize(std::stringstream& lineStream, size_t& col, const char delim) { std::string token; - while (lineStream.good()) { std::getline(lineStream, token, delim); From 2707ff1bf1177780b6f82563c72f965d3440214b Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sun, 22 Aug 2021 12:32:08 +0530 Subject: [PATCH 084/112] Adding a tutorial for DatasetMapper --- doc/tutorials/data_loading/datasetmapper.txt | 178 +++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 doc/tutorials/data_loading/datasetmapper.txt diff --git a/doc/tutorials/data_loading/datasetmapper.txt b/doc/tutorials/data_loading/datasetmapper.txt new file mode 100644 index 00000000000..ef013b3550e 
--- /dev/null +++ b/doc/tutorials/data_loading/datasetmapper.txt @@ -0,0 +1,178 @@ +/*! + +@file datasetmapper.txt +@author Gopi Tatiraju +@brief Introduction and tutorial for how to use DatasetMapper in mlpack. + +@page datasetmapper DatasetMapper Tutorial + +@section intro_datasetmapper Introduction + +DatasetMapper is a class which holds information about a dataset. This can be +used when a dataset contains categorical non-numeric features which should be +mapped to numeric features. A simple example can be + +``` +7,5,True,3 +6,3,False,4 +4,8,False,2 +9,3,True,3 +``` + +The above dataset will be respresented as + +``` +7,5,0,3 +6,3,1,4 +4,8,1,2 +9,3,0,3 +``` + +Here Mappings are + +- True mapped to 0 +- False mapped to 1 + +DatasetMapper provides an easy API to load such data and stores all the +necessary information of the dataset. + +@section toc_datasetmapper Table of Contents + +A list of all sections + + - \ref intro_datasetmapper + - \ref toc_datasetmapper + - \ref load + - \ref dimensions + - \ref type + - \ref numofmappings + - \ref checkmappings + - \ref unmapstring + - \ref unmapvalue + +@section load Loading data + +To use \b DatasetMapper we have to call a specific overload of Load fucntion. + +@code +using namespace mlpack; + +arma::mat data; +data::DatasetMapper info; +mlpack::data::Load("dataset.csv", data, info); +@endcode + +Dataset +``` +7, 5, True, 3 +6, 3, False, 4 +4, 8, False, 2 +9, 3, True, 3 +``` + +@section dimensions Dimentionality + +There are two ways to initialize a DatasetMapper object. First is to initialize +the object and set each property yourself and second is to pass the object to +Load() in which case mlpack will populate the object. If we use the later option +then the dimentionality will be same as the number of rows in the dataset. 
+ +@code +std::cout << info.Dimentionality(); +@endcode + +@code +4 +@endcode + +@section type Type of each Dimension + +Each dimension can be of either of the two types + - numeric + - categorical + +\b Type(size_t dimension) takes an argument dimension which is basically the row +number for which you want to know the type + +This will return either 0 or 1. + - 0 represents numeric + - 1 represents categorical + +@code +std::cout << info.Type(0) << "\n"; +std::cout << info.Type(1) << "\n"; +std::cout << info.Type(2) << "\n"; +std::cout << info.Type(3) << "\n"; +@endcode + +@code +0 +0 +1 +0 +@endcode + +@section numofmappings Number of Mappings + +If the type is categorical each unique token will be mapped to an integer starting +with 0. + +\b NumMappings(size_t dimension) takes dimension as an argument and returns the number of +mappings in that dimension, if the dimension is number or there are no mappings then it +will return 0. + +@code +std::cout << info.NumMappings(0) << "\n"; +std::cout << info.NumMappings(1) << "\n"; +std::cout << info.NumMappings(2) << "\n"; +std::cout << info.NumMappings(3) << "\n"; +@endcode + +@code +0 +0 +2 +0 +@endcode + +@section checkmappings Check Mappings + +There are two ways to check the mappings. 
+ - Enter the string to get mapped integer + - Enter the mapped integer to get string + +@subsection unmapstring UnmapString + +\b UnmapString(int value, size_t dimension, size_t unmappingIndex = 0UL) + - value is the mapped integer for which you want to find the original string + - dimension is the dimension in which you want to check the mappings + +@code +std::cout << info.UnmapString(0, 2) << "\n"; +std::cout << info.UnmapString(1, 2) << "\n"; +@endcode + +@code +True +False +@endcode + +@subsection unmapvalue UnmapValue + +\b UnmapValue(const std::string &input, size_t dimension) + - input is the string for which you want to find the mapped integer + - dimension is the dimension in which you want to find the mapped value + +@code +std::cout << info.UnmapValue("True", 2) << "\n"; +std::cout << info.UnmapValue("False", 2) << "\n"; +@endcode + +@code +0 +1 +@endcode + +These are basic uses of DatasetMapper. Some advanced use cases will be added soon. + +*/ From 933b68c2b30ca2bed06f1c6b41597674e117757e Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 23 Aug 2021 11:15:48 +0530 Subject: [PATCH 085/112] Handling empty line at the end --- src/mlpack/core/data/load_categorical_csv.hpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index e592e7586ba..6239e439fde 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -62,12 +62,10 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, cols = 0; std::string line; - while (std::getline(inFile, line)) + while (inFile.good()) { ++cols; - // Remove whitespaces from either side - trim(line); - + if (cols == 1) { // Extract the number of dimensions. 
@@ -88,6 +86,18 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, } } + std::getline(inFile, line); + // Remove whitespaces from either side + trim(line); + + // If it's an empty line decrease + // cols and break + if (line.size() == 0) + { + --cols; + continue; + } + // If we need to do a first pass for the DatasetMapper, do it. if (MapPolicy::NeedsFirstPass) { From 54d7824a2263f9e6948a6f7dae5d0a1901649853 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Wed, 1 Sep 2021 17:47:46 +0530 Subject: [PATCH 086/112] Style changes and fixs in tutorial --- doc/tutorials/data_loading/datasetmapper.txt | 52 ++++++++++++------- doc/tutorials/tutorials.txt | 1 + src/mlpack/core/data/detect_file_type.cpp | 2 +- src/mlpack/core/data/detect_file_type.hpp | 2 +- src/mlpack/core/data/load_categorical_csv.hpp | 19 ++++--- src/mlpack/core/data/load_csv.hpp | 12 ++--- src/mlpack/core/data/string_algorithms.hpp | 8 ++- src/mlpack/core/data/types.hpp | 8 ++- 8 files changed, 57 insertions(+), 47 deletions(-) diff --git a/doc/tutorials/data_loading/datasetmapper.txt b/doc/tutorials/data_loading/datasetmapper.txt index ef013b3550e..a786939e146 100644 --- a/doc/tutorials/data_loading/datasetmapper.txt +++ b/doc/tutorials/data_loading/datasetmapper.txt @@ -19,7 +19,7 @@ mapped to numeric features. A simple example can be 9,3,True,3 ``` -The above dataset will be respresented as +The above dataset will be represented as ``` 7,5,0,3 @@ -30,8 +30,16 @@ The above dataset will be respresented as Here Mappings are -- True mapped to 0 -- False mapped to 1 +- `True` mapped to `0` +- `False` mapped to `1` + +``` +**Note** DatasetMapper converts non-numeric values in the order +in which it encounters them in the dataset. Therefore there is a chance that +`True` might get mapped to `0` if it encounters `True` before `False`. +These `0` and `1` are not to be confused with C++ bool notations. They +are mappings created by `mlpack::data::DatasetMapper`. 
+``` DatasetMapper provides an easy API to load such data and stores all the necessary information of the dataset. @@ -52,14 +60,15 @@ A list of all sections @section load Loading data -To use \b DatasetMapper we have to call a specific overload of Load fucntion. +To use \b DatasetMapper we have to call a specific overload of `data::Load()` +function. @code using namespace mlpack; arma::mat data; data::DatasetMapper info; -mlpack::data::Load("dataset.csv", data, info); +data::Load("dataset.csv", data, info); @endcode Dataset @@ -70,15 +79,18 @@ Dataset 9, 3, True, 3 ``` -@section dimensions Dimentionality +@section dimensions Dimensionality + +There are two ways to initialize a DatasetMapper object. + +* First is to initialize the object and set each property yourself. -There are two ways to initialize a DatasetMapper object. First is to initialize -the object and set each property yourself and second is to pass the object to -Load() in which case mlpack will populate the object. If we use the later option -then the dimentionality will be same as the number of rows in the dataset. +* Second is to pass the object to Load() in which case mlpack will populate +the object. If we use the latter option then the dimensionality will be same +as what's in the data file. @code -std::cout << info.Dimentionality(); +std::cout << info.Dimensionality(); @endcode @code @@ -88,15 +100,16 @@ std::cout << info.Dimentionality(); @section type Type of each Dimension Each dimension can be of either of the two types - - numeric - - categorical + - data::Datatype::numeric + - data::Datatype::categorical -\b Type(size_t dimension) takes an argument dimension which is basically the row +\c `Type(size_t dimension)` takes an argument dimension which is the row number for which you want to know the type -This will return either 0 or 1. 
- - 0 represents numeric - - 1 represents categorical +This will return an enum `data::Datatype`, which is cast to +`size_t` when we print it using `std::cout` + - 0 represents `data::Datatype::numeric` + - 1 represents `data::Datatype::categorical` @code std::cout << info.Type(0) << "\n"; @@ -114,8 +127,9 @@ std::cout << info.Type(3) << "\n"; @section numofmappings Number of Mappings -If the type is categorical each unique token will be mapped to an integer starting -with 0. +If the type of a dimension is `data::Datatype::categorical`, then during +loading, each unique token in that dimension will be mapped to an integer +starting with 0. \b NumMappings(size_t dimension) takes dimension as an argument and returns the number of mappings in that dimension, if the dimension is number or there are no mappings then it diff --git a/doc/tutorials/tutorials.txt b/doc/tutorials/tutorials.txt index 6f6bb7356d9..1fda3371553 100644 --- a/doc/tutorials/tutorials.txt +++ b/doc/tutorials/tutorials.txt @@ -59,6 +59,7 @@ mlpack. - \ref bindings - \ref cv - \ref hpt_guide + - \ref datasetmapper @section policy_tut Policy Class Documentation diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 583112cc0a9..89c7068ad25 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -35,7 +35,7 @@ std::string GetStringType(const FileType& type) case FileType::ArmaBinary: return "Armadillo binary formatted data"; case FileType::PGMBinary: return "PGM data"; case FileType::HDF5Binary: return "HDF5 data"; - default: return ""; + default: return ""; } } diff --git a/src/mlpack/core/data/detect_file_type.hpp b/src/mlpack/core/data/detect_file_type.hpp index 9c318bfb34f..14a9fc4a6d5 100644 --- a/src/mlpack/core/data/detect_file_type.hpp +++ b/src/mlpack/core/data/detect_file_type.hpp @@ -54,7 +54,7 @@ FileType GuessFileType(std::istream& f); * @return The detected file type. 
arma::file_type_unknown if unknown. */ FileType AutoDetect(std::fstream& stream, - const std::string& filename); + const std::string& filename); /** * Return the type based only on the extension. diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 6239e439fde..eae324520d8 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -80,18 +80,17 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, { std::ostringstream oss; oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; + << info.Dimensionality() << ", but data has dimensionality " + << rows; throw std::invalid_argument(oss.str()); } } std::getline(inFile, line); - // Remove whitespaces from either side + // Remove whitespaces from either side trim(line); - // If it's an empty line decrease - // cols and break + // If it's an empty line decrease cols and break if (line.size() == 0) { --cols; @@ -160,8 +159,8 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& inout, DatasetMapper& inf template void LoadCSV::NonTransposeParse(arma::Mat& inout, - DatasetMapper& infoSet) + DatasetMapper& infoSet) { // Get the size of the matrix. 
size_t rows, cols; @@ -336,8 +335,8 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, { std::ostringstream oss; oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" - << col << ") on line " << row << "; should be " << cols - << " dimensions."; + << col << ") on line " << row << "; should be " << cols + << " dimensions."; throw std::runtime_error(oss.str()); } ++row; col = 0; diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index eec3c1b66be..8872ee9b46f 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -214,8 +214,8 @@ class LoadCSV f.clear(); const std::fstream::pos_type pos1 = f.tellg(); - size_t f_n_rows = 0; - size_t f_n_cols = 0; + size_t fnRows = 0; + size_t fnCols = 0; std::string lineString; std::stringstream lineStream; std::string token; @@ -241,16 +241,16 @@ class LoadCSV // row, then the highest number of cols will be // considered as the size of the matrix. Missing // elements will be filled as 0 - if (f_n_cols < line_n_cols) - f_n_cols = line_n_cols; + if (fnCols < line_n_cols) + fnCols = line_n_cols; - ++f_n_rows; + ++fnRows; } f.clear(); f.seekg(pos1); - std::pair mat_size(f_n_rows, f_n_cols); + std::pair mat_size(fnRows, fnCols); return mat_size; } diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 5d5d44019b6..872667ad9f0 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -13,8 +13,8 @@ #ifndef MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP #define MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP -namespace mlpack{ -namespace data{ +namespace mlpack { +namespace data { /** * A simple trim fucntion to strip off whitespaces @@ -97,6 +97,4 @@ inline void trim_if(std::string &str, std::function func) } // namespace data } // namespace mlpack -#endif - - +#endif diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index 0a0de300ff6..05db914b28c 
100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -22,10 +22,8 @@ #include #include -namespace mlpack -{ -namespace data -{ +namespace mlpack { +namespace data { enum struct FileType { @@ -39,7 +37,7 @@ enum struct FileType PGMBinary, //!< Portable Grey Map (greyscale image) PPMBinary, //!< Portable Pixel Map (colour image), used by the field and cube classes HDF5Binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data - CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) + CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) }; /** From 7904f39606129dce0044ef9068c7fc20c6abe4a5 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Thu, 2 Sep 2021 23:56:57 +0530 Subject: [PATCH 087/112] Some typos --- src/mlpack/core/data/load_csv.hpp | 4 ++-- src/mlpack/core/data/string_algorithms.hpp | 4 ++-- src/mlpack/core/data/types.hpp | 4 ++-- src/mlpack/core/data/types_impl.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 8872ee9b46f..062d6eff4c9 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -130,7 +130,7 @@ class LoadCSV /** * Caluculate number of columns in each row - * and assign the value to the col. This fucntion + * and assign the value to the col. This function * will work only for numeric data. * * @param lineStream a single row of data @@ -199,7 +199,7 @@ class LoadCSV // Functions common to both numeric & categorical parser /** - * Get the size of the matrix. Based on isNumeric the fucntion can be used + * Get the size of the matrix. Based on isNumeric the function can be used * for both numeric_parse and categorical_parse. 
* * @param f fstream stream to open the data file diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 872667ad9f0..2952a8c81a3 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -17,7 +17,7 @@ namespace mlpack { namespace data { /** - * A simple trim fucntion to strip off whitespaces + * A simple trim function to strip off whitespaces * from both the side of string. If input is a string * with all spaces then str will be empty string. * @@ -55,7 +55,7 @@ inline void trim(std::string& str) * be trimmed off. * * @param str string to be trimmed - * @param func fucntion to determine the characters which should be trimmed + * @param func function to determine the characters which should be trimmed */ inline void trim_if(std::string &str, std::function func) { diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index 05db914b28c..b67f1f012fd 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -2,12 +2,12 @@ * @file core/data/types.hpp * @author Gopi M. Tatiraju * - * This file contains utilitiy fucntions related to types of data. + * This file contains utilitiy functions related to types of data. * We have adapted all the standard types which are available in armadillo. * * This file also contains functions to convery external file types to mlpack * file types. In future if we the user of mlpack needs support of an external - * linear algebra library like armadillo, all fucntions related to handling the + * linear algebra library like armadillo, all functions related to handling the * types goes here. 
* * mlpack is free software; you may redistribute it and/or modify it under the diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp index f3a6eaea674..18fd0a4a634 100644 --- a/src/mlpack/core/data/types_impl.hpp +++ b/src/mlpack/core/data/types_impl.hpp @@ -2,12 +2,12 @@ * @file core/data/types.hpp * @author Gopi M. Tatiraju * - * This file contains utilitiy fucntions related to types of data. + * This file contains utilitiy functions related to types of data. * We have adapted all the standard types which are available in armadillo. * * This file also contains functions to convery external file types to mlpack * file types. In future if we the user of mlpack needs support of an external - * linear algebra library like armadillo, all fucntions related to handling the + * linear algebra library like armadillo, all functions related to handling the * types goes here. * * mlpack is free software; you may redistribute it and/or modify it under the From 128da6b0eba40a36133f685715462587d837cb37 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Sat, 4 Sep 2021 03:43:32 +0530 Subject: [PATCH 088/112] Adding condition in case ConvertToken() fails --- src/mlpack/core/data/load_numeric_csv.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 774680e3ec3..249c719ccf2 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -64,7 +64,7 @@ bool LoadCSV::ConvertToken(eT& val, char* endptr = nullptr; - // Convert the token into ccorrect type. + // Convert the token into correct type. 
// If we have a eT as unsigned int, // it will convert all negative numbers to 0 if (std::is_floating_point::value) @@ -85,6 +85,10 @@ bool LoadCSV::ConvertToken(eT& val, val = eT(std::strtoull(str, &endptr, 10)); } } + // None of the above conditions were executed + // So conversion failed + else + return false; if (str == endptr) return false; From c2151d7737d604b026343efeb0f5ae135e4fbc70 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 20 Sep 2021 11:40:19 +0530 Subject: [PATCH 089/112] More style issues --- src/mlpack/core/data/load_csv.hpp | 57 +++++++++++----------- src/mlpack/core/data/load_numeric_csv.hpp | 12 ++--- src/mlpack/core/data/string_algorithms.hpp | 8 +-- src/mlpack/core/data/types.hpp | 2 +- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 062d6eff4c9..734c27e426c 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -74,7 +74,7 @@ class LoadCSV * rules necessary for loading and will attempt to open the file. This will also * initialize the delimiter character for parsing. * - * @param file path of the dataset + * @param file path of the dataset. */ LoadCSV(const std::string& file) : extension(Extension(file)), @@ -108,8 +108,8 @@ class LoadCSV * the second pass, the function converts each value to required datatype * and sets it equal to val. * - * @param x Matrix in which data will be loaded - * @param f File stream to access the data file + * @param x Matrix in which data will be loaded. + * @param f File stream to access the data file. */ template bool LoadNumericCSV(arma::Mat& x, std::fstream& f); @@ -122,25 +122,25 @@ class LoadCSV * Token is always read as a string, if the given token is +/-INF or NAN * it converts them to infinity and NAN using numeric_limits. 
* - * @param val Token's value will be assigned to this address - * @param token Value which should be assigned + * @param val Token's value will be assigned to this address. + * @param token Value which should be assigned. */ template bool ConvertToken(eT& val, const std::string& token); /** - * Caluculate number of columns in each row + * Calculate the number of columns in each row * and assign the value to the col. This function * will work only for numeric data. * - * @param lineStream a single row of data - * @param col number of columns in lineStream - * @param delim delimiter character + * @param lineStream a single row of data. + * @param col number of columns in lineStream. + * @param delim delimiter character. */ inline void NumericMatSize(std::stringstream& lineStream, size_t& col, const char delim); - // Functions for Categorical Parse + // Functions for Categorical Parse. /** * Load the file into the given matrix with the given DatasetMapper object. @@ -148,8 +148,7 @@ class LoadCSV * * @param inout Matrix to load into. * @param infoSet DatasetMapper to use while loading. - * @param transpose If true, the matrix should be transposed on loading - * (default). + * @param transpose If true, the matrix should be transposed on loading(default). */ template void LoadCategoricalCSV(arma::Mat &inout, @@ -185,25 +184,25 @@ class LoadCSV DatasetMapper& info); /** - * Calculate number of columns in each row + * Calculate the number of columns in each row * and assign the value to the col. This function * will work for categorical data. * - * @param lineStream a single row of data - * @param col number of columns in lineStream - * @param delim delimiter character + * @param lineStream a single row of data. + * @param col the number of columns in lineStream. + * @param delim the delimiter character. 
*/ inline void CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim); - // Functions common to both numeric & categorical parser + // Functions common to both numeric & categorical parser. /** * Get the size of the matrix. Based on isNumeric the function can be used * for both numeric_parse and categorical_parse. * - * @param f fstream stream to open the data file - * @param delim char delimiter charecter + * @param f fstream stream to open the data file. + * @param delim char delimiter charecter. */ template inline std::pair GetMatrixSize(std::fstream& f, @@ -222,7 +221,7 @@ class LoadCSV while (f.good() && load_okay) { - // Get a row of data + // Get a row of data. std::getline(f, lineString); if (lineString.size() == 0) break; @@ -231,7 +230,7 @@ class LoadCSV lineStream.str(lineString); size_t line_n_cols = 0; - // Get number of columns based on the type of data + // Get number of columns based on the type of data. if (isNumeric) NumericMatSize(lineStream, line_n_cols, delim); else @@ -240,7 +239,7 @@ class LoadCSV // If there are different number of columns in each // row, then the highest number of cols will be // considered as the size of the matrix. Missing - // elements will be filled as 0 + // elements will be filled as 0. if (fnCols < line_n_cols) fnCols = line_n_cols; @@ -264,25 +263,25 @@ class LoadCSV */ inline void CheckOpen() { - // check if file is opening + // Check if the file is opening. if (!inFile.is_open()) { std::ostringstream oss; oss << "Cannot open file '" << filename << "'. " << std::endl; - // throw an exception if file is not opening + // Throw an exception if the file is not opening. throw std::runtime_error(oss.str()); } - // clear format flag + // Clear format flag. inFile.unsetf(std::ios::skipws); } - // Fucntions for Categorical Parse + // Functions for Categorical Parse. /** * Parse a non-transposed matrix. * - * @param inout Matrix to load into. + * @param input Matrix to load into. 
* @param infoSet DatasetMapper object to load with. */ template @@ -292,7 +291,7 @@ class LoadCSV /** * Parse a transposed matrix. * - * @param inout Matrix to load into. + * @param input Matrix to load into. * @param infoSet DatasetMapper to load with. */ template @@ -304,7 +303,7 @@ class LoadCSV std::string filename; //! Opened stream for reading. std::fstream inFile; - //! Delimiter char + //! Delimiter char. char delim; }; diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 249c719ccf2..985c3bd41bf 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -33,7 +33,7 @@ bool LoadCSV::ConvertToken(eT& val, // Checks for +/-INF and NAN // Converts them to their equivalent representation - // from numeric_limits + // from numeric_limits. if ((N == 3) || (N == 4)) { const bool neg = (str[0] == '-'); @@ -66,7 +66,7 @@ bool LoadCSV::ConvertToken(eT& val, // Convert the token into correct type. // If we have a eT as unsigned int, - // it will convert all negative numbers to 0 + // it will convert all negative numbers to 0. if (std::is_floating_point::value) { val = eT(std::strtod(str, &endptr)); @@ -85,8 +85,8 @@ bool LoadCSV::ConvertToken(eT& val, val = eT(std::strtoull(str, &endptr, 10)); } } - // None of the above conditions were executed - // So conversion failed + // If none of the above conditions was executed, + // then the conversion will fail. else return false; @@ -111,7 +111,7 @@ bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) while (f.good()) { - // Parse the file line by line + // Parse the file line by line. std::getline(f, lineString); if (lineString.size() == 0) @@ -124,7 +124,7 @@ bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) while (lineStream.good()) { - // Parse each line + // Parse each line. std::getline(lineStream, token, ','); // This will handle loading of both dense and sparse. 
diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 2952a8c81a3..3dcb6460fb0 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -21,7 +21,7 @@ namespace data { * from both the side of string. If input is a string * with all spaces then str will be empty string. * - * @param str string to be trimmed + * @param str the string to be trimmed. */ inline void trim(std::string& str) { @@ -54,8 +54,8 @@ inline void trim(std::string& str) * used to determine which characters will * be trimmed off. * - * @param str string to be trimmed - * @param func function to determine the characters which should be trimmed + * @param str the string to be trimmed. + * @param func function to determine the characters which should be trimmed. */ inline void trim_if(std::string &str, std::function func) { @@ -79,7 +79,7 @@ inline void trim_if(std::string &str, std::function func) size_t endIndex = str.size() - 1; - for(int i = str.size() - 1; i >= 0; i--) + for (int i = str.size() - 1; i >= 0; i--) { bool match = func(str[i]); if (match) diff --git a/src/mlpack/core/data/types.hpp b/src/mlpack/core/data/types.hpp index b67f1f012fd..ac1660eb88a 100644 --- a/src/mlpack/core/data/types.hpp +++ b/src/mlpack/core/data/types.hpp @@ -44,7 +44,7 @@ enum struct FileType * This function is used to convert mlpack file types to * their respective Armadillo file types. * - * @param type mlpack::FileType + * @param type mlpack::FileType. 
*/ inline arma::file_type ToArmaFileType(const FileType& type); From 1537e86fe0accd703eadaa3c81a321a5efb6cab3 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 18 Oct 2021 07:27:19 +0530 Subject: [PATCH 090/112] Handling failing of ConvertToken() --- src/mlpack/core/data/load_numeric_csv.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 985c3bd41bf..6fc3c5b6a05 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -136,6 +136,14 @@ bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) x.at(row, col) = tmp_val; ++col; } + else + { + // Printing failed token and it's location. + Log::Warn << "Failed to convert token " << token << ", at row " << row << ", column " + << col << " of matrix!"; + + return false; + } } ++row; } From 67325c59e5df7da2feb9587e59914b615de520c5 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Mon, 18 Oct 2021 07:32:27 +0530 Subject: [PATCH 091/112] Adding comment --- src/mlpack/core/data/load_numeric_csv.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 6fc3c5b6a05..e5e85c120a9 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -90,6 +90,9 @@ bool LoadCSV::ConvertToken(eT& val, else return false; + // If strtod(), strtoll() or strtoull() cannot convert the token, + // endptr will be left equal to str and this condition will be + // executed.
if (str == endptr) return false; From 73d37db0a0a53a13623df66db57b9798eec57916 Mon Sep 17 00:00:00 2001 From: Gopi M Tatiraju Date: Fri, 22 Oct 2021 12:37:30 +0530 Subject: [PATCH 092/112] Apply suggestions from code review Style fix Co-authored-by: Ryan Curtin --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- src/mlpack/core/data/load_numeric_csv.hpp | 36 ++++++++++--------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index eae324520d8..a9e5954dda2 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -270,7 +270,7 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf { std::ostringstream oss; oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row - << ") on line " << col << "; should be " << rows << " dimensions."; + << ") on line " << col << "; should be " << rows << " dimensions."; throw std::runtime_error(oss.str()); } // Increment the column index. diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index e5e85c120a9..7fb9541a36d 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -41,21 +41,21 @@ bool LoadCSV::ConvertToken(eT& val, const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; - const char sig_a = str[offset]; - const char sig_b = str[offset+1]; - const char sig_c = str[offset+2]; + const char sigA = str[offset]; + const char sigB = str[offset + 1]; + const char sigC = str[offset + 2]; - if (((sig_a == 'i') || (sig_a == 'I')) && - ((sig_b == 'n') || (sig_b == 'N')) && - ((sig_c == 'f') || (sig_c == 'F'))) + if (((sigA == 'i') || (sigA == 'I')) && + ((sigB == 'n') || (sigB == 'N')) && + ((sigC == 'f') || (sigC == 'F'))) { val = neg ? 
-(std::numeric_limits ::infinity()) : std::numeric_limits::infinity(); return true; } - else if (((sig_a == 'n') || (sig_a == 'N')) && - ((sig_b == 'a') || (sig_b == 'A')) && - ((sig_c == 'n') || (sig_c == 'N'))) + else if (((sigA == 'n') || (sigA == 'N')) && + ((sigB == 'a') || (sigB == 'A')) && + ((sigC == 'n') || (sigC == 'N'))) { val = std::numeric_limits::quiet_NaN(); return true; @@ -102,7 +102,7 @@ bool LoadCSV::ConvertToken(eT& val, template bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) { - bool load_okay = f.good(); + bool loadOkay = f.good(); f.clear(); std::pair mat_size = GetMatrixSize(f); x.zeros(mat_size.first, mat_size.second); @@ -132,28 +132,30 @@ bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) // This will handle loading of both dense and sparse. // Initialize tmp_val of type eT with value 0. - eT tmp_val = eT(0); + eT tmpVal = eT(0); - if (ConvertToken(tmp_val, token)) + if (ConvertToken(tmpVal, token)) { - x.at(row, col) = tmp_val; + x.at(row, col) = tmpVal; ++col; } else { // Printing failed token and it's location. 
- Log::Warn << "Failed to convert token " << token << ", at row " << row << ", column " - << col << " of matrix!"; + Log::Warn << "Failed to convert token " << token << ", at row " << row + << ", column " << col << " of matrix!"; return false; } } ++row; } - return load_okay; + return loadOkay; } -inline void LoadCSV::NumericMatSize(std::stringstream& lineStream, size_t& col, const char delim) +inline void LoadCSV::NumericMatSize(std::stringstream& lineStream, + size_t& col, + const char delim) { std::string token; while (lineStream.good()) From cc918557ec40d88b20f76d4568bf75fbfc286d2a Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 26 Oct 2021 12:58:47 +0530 Subject: [PATCH 093/112] More style changes --- src/mlpack/core/data/detect_file_type.cpp | 2 +- src/mlpack/core/data/load_arff_impl.hpp | 12 +++++------ src/mlpack/core/data/load_categorical_csv.hpp | 16 +++++++-------- src/mlpack/core/data/load_csv.hpp | 20 +++++++++---------- src/mlpack/core/data/load_impl.hpp | 2 +- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index 89c7068ad25..0219f95074c 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -211,7 +211,7 @@ FileType AutoDetect(std::fstream& stream, const std::string& filename) const std::streampos pos = stream.tellg(); std::string line; std::getline(stream, line, '\n'); - trim(line); + Trim(line); // Reset stream position. stream.seekg(pos); diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 280eb16cb19..33be092cfa2 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -46,7 +46,7 @@ void LoadARFF(const std::string& filename, { // Read the next line, then strip whitespace from either side. 
std::getline(ifs, line, '\n'); - trim(line); + Trim(line); ++headerLines; // Is the first character a comment, or is the line empty? @@ -102,7 +102,7 @@ void LoadARFF(const std::string& filename, // `origDimType` string here instead (which has not had ::tolower used // on it). types.push_back(true); - trim_if(origDimType, + TrimIf(origDimType, [](char c) { return c == '{' || c == '}' || c == ' ' || c == '\t'; @@ -116,7 +116,7 @@ void LoadARFF(const std::string& filename, while (it != dimTok.end()) { std::string category = (*it); - trim(category); + Trim(category); categories.push_back(category); ++it; @@ -198,7 +198,7 @@ void LoadARFF(const std::string& filename, while (ifs.good()) { std::getline(ifs, line, '\n'); - trim(line); + Trim(line); // Each line of the @data section must be a CSV (except sparse data, which // we will handle later). So now we can tokenize the // CSV and parse it. The '?' representing a missing value is not allowed, @@ -232,7 +232,7 @@ void LoadARFF(const std::string& filename, { // Strip spaces before mapping. std::string token = *it; - trim(token); + Trim(token); const size_t currentNumMappings = info.NumMappings(col); const eT result = info.template MapString(token, col); @@ -272,7 +272,7 @@ void LoadARFF(const std::string& filename, // error, otherwise we issue a general error. 
std::stringstream error; std::string tokenStr = token.str(); - trim(tokenStr); + Trim(tokenStr); if (tokenStr == "?") error << "Missing values ('?') not supported, "; else diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index a9e5954dda2..844611d1cf5 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -88,7 +88,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, std::getline(inFile, line); // Remove whitespaces from either side - trim(line); + Trim(line); // If it's an empty line decrease cols and break if (line.size() == 0) @@ -112,7 +112,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, { std::getline(lineStream, token, delim); // Remove whitespace from either side - trim(token); + Trim(token); if (token[0] == '"' && token[token.size() - 1] != '"') { @@ -173,7 +173,7 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& inout, DatasetMapper& inf while (std::getline(inFile, line)) { // Remove whitespaces from either side - trim(line); + Trim(line); // Reset the row we are looking at. (Remember this is transposed.) 
row = 0; std::stringstream lineStream; @@ -247,7 +247,7 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf { std::getline(lineStream, token, delim); // Remove whitespaces from either side - trim(token); + Trim(token); if (token[0] == '"' && token[token.size() - 1] != '"') { @@ -299,7 +299,7 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, while (std::getline(inFile, line)) { // Remove whitespaces from either side - trim(line); + Trim(line); std::stringstream lineStream; std::string token; @@ -314,7 +314,7 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, std::getline(lineStream, token, delim); // Remove whitespace from either side - trim(token); + Trim(token); if (token[0] == '"' && token[token.size() - 1] != '"') { diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index 734c27e426c..e0eb527665a 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -77,9 +77,9 @@ class LoadCSV * @param file path of the dataset. */ LoadCSV(const std::string& file) : - extension(Extension(file)), - filename(file), - inFile(file) + extension(Extension(file)), + filename(file), + inFile(file) { if (extension == "csv") { @@ -208,7 +208,7 @@ class LoadCSV inline std::pair GetMatrixSize(std::fstream& f, const char delim = ',') { - bool load_okay = f.good(); + bool loadOkay = f.good(); f.clear(); const std::fstream::pos_type pos1 = f.tellg(); @@ -219,7 +219,7 @@ class LoadCSV std::stringstream lineStream; std::string token; - while (f.good() && load_okay) + while (f.good() && loadOkay) { // Get a row of data. std::getline(f, lineString); @@ -228,20 +228,20 @@ class LoadCSV lineStream.clear(); lineStream.str(lineString); - size_t line_n_cols = 0; + size_t lineNCols = 0; // Get number of columns based on the type of data. 
if (isNumeric) - NumericMatSize(lineStream, line_n_cols, delim); + NumericMatSize(lineStream, lineNCols, delim); else - CategoricalMatSize(lineStream, line_n_cols, delim); + CategoricalMatSize(lineStream, lineNCols, delim); // If there are different number of columns in each // row, then the highest number of cols will be // considered as the size of the matrix. Missing // elements will be filled as 0. - if (fnCols < line_n_cols) - fnCols = line_n_cols; + if (fnCols < lineNCols) + fnCols = lineNCols; ++fnRows; } diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 7b9f8eac909..226960a7e03 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -39,7 +39,7 @@ std::vector ToTokens(Tokenizer& lineTok) [&tokens](std::string const &str) { std::string trimmedToken(str); - trim(trimmedToken); + Trim(trimmedToken); return std::move(trimmedToken); }); From d986e3dc3b01ff7f6b704e4b36e96039e983486c Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 26 Oct 2021 12:59:36 +0530 Subject: [PATCH 094/112] Replacing ternary operator with simple if/else block --- src/mlpack/core/data/string_algorithms.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index 3dcb6460fb0..b4414d5a739 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -23,7 +23,7 @@ namespace data { * * @param str the string to be trimmed. */ -inline void trim(std::string& str) +inline void Trim(std::string& str) { if (str.find_first_not_of(' ') == std::string::npos) { @@ -57,7 +57,7 @@ inline void trim(std::string& str) * @param str the string to be trimmed. * @param func function to determine the characters which should be trimmed. 
*/ -inline void trim_if(std::string &str, std::function func) +inline void TrimIf(std::string &str, std::function func) { if (str.find_first_not_of(' ') == std::string::npos) { @@ -88,8 +88,10 @@ inline void trim_if(std::string &str, std::function func) break; } - std::string trimmedStr = (endIndex - startIndex == str.size()) ? - std::move(str) : str.substr(startIndex, endIndex - startIndex + 1); + if (endIndex - startIndex == str.size()) + trimmedStr = std::move(str); + else + trimmedStr = str.substr(startIndex, endIndex - startIndex + 1); str = trimmedStr; } From 5cade8d33750f9647e8a930488a481a85488db4d Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 26 Oct 2021 13:26:30 +0530 Subject: [PATCH 095/112] minor bug --- src/mlpack/core/data/string_algorithms.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index b4414d5a739..ef01815af03 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -41,9 +41,12 @@ inline void Trim(std::string& str) while (std::isspace(str[endIndex])) endIndex--; - std::string trimmedStr = (endIndex - startIndex == str.size()) ? 
- std::move(str) : str.substr(startIndex, - endIndex - startIndex + 1); + std::string trimmedStr; + + if (endIndex - startIndex == str.size()) + trimmedStr = std::move(str); + else + trimmedStr = str.substr(startIndex, endIndex - startIndex + 1); str = trimmedStr; } @@ -88,6 +91,8 @@ inline void TrimIf(std::string &str, std::function func) break; } + std::string trimmedStr; + if (endIndex - startIndex == str.size()) trimmedStr = std::move(str); else From 49dce56b99f938bbb247e2dc1fe1188a55513ee8 Mon Sep 17 00:00:00 2001 From: heisenbuug Date: Tue, 2 Nov 2021 22:04:44 +0530 Subject: [PATCH 096/112] Adding comment --- src/mlpack/core/data/string_algorithms.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index ef01815af03..c1b6df71f1e 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -43,6 +43,10 @@ inline void Trim(std::string& str) std::string trimmedStr; + // Using ternary operator is not recommended here. + // Ternary operator is only useful for simple expressions + // that don't involve varying types. + // https://en.cppreference.com/w/cpp/language/operator_other if (endIndex - startIndex == str.size()) trimmedStr = std::move(str); else @@ -93,6 +97,10 @@ inline void TrimIf(std::string &str, std::function func) std::string trimmedStr; + // Using ternary operator is not recommended here. + // Ternary operator is only useful for simple expressions + // that don't involve varying types.
+ // https://en.cppreference.com/w/cpp/language/operator_other if (endIndex - startIndex == str.size()) trimmedStr = std::move(str); else From 0307574f444018e09eabd18e6f7dd34751b6788f Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:06 +0100 Subject: [PATCH 097/112] Update src/mlpack/core/data/string_algorithms.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/string_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp index c1b6df71f1e..5bc3291733e 100644 --- a/src/mlpack/core/data/string_algorithms.hpp +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -18,7 +18,7 @@ namespace data { /** * A simple trim function to strip off whitespaces - * from both the side of string. If input is a string + * from both the sides of a string. If input is a string * with all spaces then str will be empty string. * * @param str the string to be trimmed. From 967a7c9498f95b2213cfa8f6213605a28513fc58 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:14 +0100 Subject: [PATCH 098/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 844611d1cf5..150600df173 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -313,7 +313,7 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, token.clear(); std::getline(lineStream, token, delim); - // Remove whitespace from either side + // Remove whitespace from either side. 
Trim(token); if (token[0] == '"' && token[token.size() - 1] != '"') From 2cba1d6f1580e3f6b0e6c0341196fc982e616cd0 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:21 +0100 Subject: [PATCH 099/112] Update src/mlpack/core/data/load_numeric_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_numeric_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp index 7fb9541a36d..a43bdd02de3 100644 --- a/src/mlpack/core/data/load_numeric_csv.hpp +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -22,7 +22,7 @@ bool LoadCSV::ConvertToken(eT& val, const std::string& token) { const size_t N = size_t(token.length()); - // Fill empty data points with 0 + // Fill empty data points with 0. if (N == 0) { val = eT(0); From ce36efb41d8b597af382ea3c52546983f20ea1f2 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:29 +0100 Subject: [PATCH 100/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 150600df173..d3401bf9adb 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -298,7 +298,7 @@ void LoadCSV::NonTransposeParse(arma::Mat& inout, while (std::getline(inFile, line)) { - // Remove whitespaces from either side + // Remove whitespaces from either side. 
Trim(line); std::stringstream lineStream; From eec152f8037befb1272396436e688d08ee83832b Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:37 +0100 Subject: [PATCH 101/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index d3401bf9adb..98d6e032d27 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -87,7 +87,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, } std::getline(inFile, line); - // Remove whitespaces from either side + // Remove whitespaces from either side. Trim(line); // If it's an empty line decrease cols and break From 14edebbda4c85f790efb2eb17744c52815609ef6 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:47 +0100 Subject: [PATCH 102/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 98d6e032d27..eadddd2347d 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -90,7 +90,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, // Remove whitespaces from either side. Trim(line); - // If it's an empty line decrease cols and break + // If it's an empty line decrease cols and break. 
if (line.size() == 0) { --cols; From 1c8e3bb1c44cdf42b7d34802baf62ebf0478f4d5 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:14:53 +0100 Subject: [PATCH 103/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index eadddd2347d..2cb09cfbdaf 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -51,7 +51,7 @@ void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, DatasetMapper& info) { // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). This might be useful + // we will pass everything as string through MapString(). This might be useful // if, e.g., the MapPolicy needs to find which dimensions are numeric or // categorical. 
From 99def5a1b5a9461d82ccb0814f5df131bf7732f7 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:00 +0100 Subject: [PATCH 104/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 2cb09cfbdaf..e380320a403 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -30,7 +30,8 @@ void LoadCSV::LoadCategoricalCSV(arma::Mat &inout, NonTransposeParse(inout, infoSet); } -inline void LoadCSV::CategoricalMatSize(std::stringstream& lineStream, size_t& col, const char delim) +inline void LoadCSV::CategoricalMatSize( + std::stringstream& lineStream, size_t& col, const char delim) { std::string token; while (lineStream.good()) From 4f7df5893d9402ac44d3e6c89f77dbb37e87692e Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:05 +0100 Subject: [PATCH 105/112] Update doc/tutorials/data_loading/datasetmapper.txt Co-authored-by: Marcus Edel --- doc/tutorials/data_loading/datasetmapper.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials/data_loading/datasetmapper.txt b/doc/tutorials/data_loading/datasetmapper.txt index a786939e146..ac73095740a 100644 --- a/doc/tutorials/data_loading/datasetmapper.txt +++ b/doc/tutorials/data_loading/datasetmapper.txt @@ -132,7 +132,7 @@ loading, each unique token in that dimension will be mapped to an integer starting with 0. \b NumMappings(size_t dimension) takes dimension as an argument and returns the number of -mappings in that dimension, if the dimension is number or there are no mappings then it +mappings in that dimension, if the dimension is a number or there are no mappings then it will return 0. 
@code From 6d63b15e2656ff729453881f5b81822092f6c051 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:12 +0100 Subject: [PATCH 106/112] Update doc/tutorials/data_loading/datasetmapper.txt Co-authored-by: Marcus Edel --- doc/tutorials/data_loading/datasetmapper.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials/data_loading/datasetmapper.txt b/doc/tutorials/data_loading/datasetmapper.txt index ac73095740a..bbd16fb76a5 100644 --- a/doc/tutorials/data_loading/datasetmapper.txt +++ b/doc/tutorials/data_loading/datasetmapper.txt @@ -127,7 +127,7 @@ std::cout << info.Type(3) << "\n"; @section numofmappings Number of Mappings -If the type of a dimention is `data::Datatype::categorical`, then during the +If the type of a dimension is `data::Datatype::categorical`, then during loading, each unique token in that dimension will be mapped to an integer starting with 0. From 2b2c7fa950e9f0e751a89a6eb27609a2e8d912b5 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:23 +0100 Subject: [PATCH 107/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index e380320a403..e98b02f7a07 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -136,7 +136,7 @@ template void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& info) { // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). This might be useful + // we will pass everything as string through MapString(). This might be useful // if, e.g., the MapPolicy needs to find which dimensions are numeric or // categorical. 
From 5e8afca1a99dfef78f2d44c81a494a3d9350eb69 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:32 +0100 Subject: [PATCH 108/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index e98b02f7a07..33993dcba37 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -173,7 +173,7 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper Date: Fri, 5 Nov 2021 20:15:44 +0100 Subject: [PATCH 109/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index 33993dcba37..ccc5263bd7d 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -235,7 +235,7 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf while (std::getline(inFile, line)) { - // Remove whitespaces from either side + // Remove whitespaces from either side. Trim(line); // Reset the row we are looking at. (Remember this is transposed.) 
row = 0; From 72c989d82bcfc4630486ca5812922316153f0e4f Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:15:53 +0100 Subject: [PATCH 110/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index ccc5263bd7d..aa4f2977b97 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -196,7 +196,7 @@ void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper Date: Fri, 5 Nov 2021 20:16:00 +0100 Subject: [PATCH 111/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index aa4f2977b97..da1956b1d44 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -247,7 +247,7 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf while (lineStream.good()) { std::getline(lineStream, token, delim); - // Remove whitespaces from either side + // Remove whitespaces from either side. 
Trim(token); if (token[0] == '"' && token[token.size() - 1] != '"') From 4eb946863d46dc81b1b380ea94dc9018650e655c Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 5 Nov 2021 20:16:13 +0100 Subject: [PATCH 112/112] Update src/mlpack/core/data/load_categorical_csv.hpp Co-authored-by: Marcus Edel --- src/mlpack/core/data/load_categorical_csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp index da1956b1d44..87d6b41adcc 100644 --- a/src/mlpack/core/data/load_categorical_csv.hpp +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -252,7 +252,7 @@ void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& inf if (token[0] == '"' && token[token.size() - 1] != '"') { - // first part of the string + // First part of the string. std::string tok = token; while (token[token.size() - 1] != '"') {