diff --git a/doc/tutorials/data_loading/datasetmapper.txt b/doc/tutorials/data_loading/datasetmapper.txt new file mode 100644 index 00000000000..bbd16fb76a5 --- /dev/null +++ b/doc/tutorials/data_loading/datasetmapper.txt @@ -0,0 +1,192 @@ +/*! + +@file datasetmapper.txt +@author Gopi Tatiraju +@brief Introduction and tutorial for how to use DatasetMapper in mlpack. + +@page datasetmapper DatasetMapper Tutorial + +@section intro_datasetmapper Introduction + +DatasetMapper is a class which holds information about a dataset. This can be +used when a dataset contains categorical non-numeric features which should be +mapped to numeric features. A simple example can be + +``` +7,5,True,3 +6,3,False,4 +4,8,False,2 +9,3,True,3 +``` + +The above dataset will be represented as + +``` +7,5,0,3 +6,3,1,4 +4,8,1,2 +9,3,0,3 +``` + +Here the mappings are + +- `True` mapped to `0` +- `False` mapped to `1` + +``` +**Note** DatasetMapper converts non-numeric values in the order +in which it encounters them in the dataset. Therefore there is a chance that +`True` might get mapped to `0` if it encounters `True` before `False`. +These `0` and `1` are not to be confused with C++ bool notations. These +are mappings created by `mlpack::data::DatasetMapper`. +``` + +DatasetMapper provides an easy API to load such data and stores all the +necessary information of the dataset. + +@section toc_datasetmapper Table of Contents + +A list of all sections + + - \ref intro_datasetmapper + - \ref toc_datasetmapper + - \ref load + - \ref dimensions + - \ref type + - \ref numofmappings + - \ref checkmappings + - \ref unmapstring + - \ref unmapvalue + +@section load Loading data + +To use \b DatasetMapper we have to call a specific overload of `data::Load()` +function. 
+ +@code +using namespace mlpack; + +arma::mat data; +data::DatasetMapper info; +data::Load("dataset.csv", data, info); +@endcode + +Dataset +``` +7, 5, True, 3 +6, 3, False, 4 +4, 8, False, 2 +9, 3, True, 3 +``` + +@section dimensions Dimensionality + +There are two ways to initialize a DatasetMapper object. + +* First is to initialize the object and set each property yourself. + +* Second is to pass the object to Load() in which case mlpack will populate +the object. If we use the latter option then the dimensionality will be the +same as what's in the data file. + +@code +std::cout << info.Dimensionality(); +@endcode + +@code +4 +@endcode + +@section type Type of each Dimension + +Each dimension can be one of two types: + - data::Datatype::numeric + - data::Datatype::categorical + +\c `Type(size_t dimension)` takes an argument dimension which is the row +number for which you want to know the type. + +This will return an enum `data::Datatype`, which is cast to +`size_t` when printed using `std::cout`: + - 0 represents `data::Datatype::numeric` + - 1 represents `data::Datatype::categorical` + +@code +std::cout << info.Type(0) << "\n"; +std::cout << info.Type(1) << "\n"; +std::cout << info.Type(2) << "\n"; +std::cout << info.Type(3) << "\n"; +@endcode + +@code +0 +0 +1 +0 +@endcode + +@section numofmappings Number of Mappings + +If the type of a dimension is `data::Datatype::categorical`, then during +loading, each unique token in that dimension will be mapped to an integer +starting with 0. + +\b NumMappings(size_t dimension) takes dimension as an argument and returns the number of +mappings in that dimension. If the dimension is numeric or there are no mappings, then it +will return 0. 
+ +@code +std::cout << info.NumMappings(0) << "\n"; +std::cout << info.NumMappings(1) << "\n"; +std::cout << info.NumMappings(2) << "\n"; +std::cout << info.NumMappings(3) << "\n"; +@endcode + +@code +0 +0 +2 +0 +@endcode + +@section checkmappings Check Mappings + +There are two ways to check the mappings. + - Enter the string to get the mapped integer + - Enter the mapped integer to get the string + +@subsection unmapstring UnmapString + +\b UnmapString(int value, size_t dimension, size_t unmappingIndex = 0UL) + - value is the mapped integer for which you want to find the original string + - dimension is the dimension in which you want to check the mappings + +@code +std::cout << info.UnmapString(0, 2) << "\n"; +std::cout << info.UnmapString(1, 2) << "\n"; +@endcode + +@code +True +False +@endcode + +@subsection unmapvalue UnmapValue + +\b UnmapValue(const std::string &input, size_t dimension) + - input is the string for which you want to find the mapped value + - dimension is the dimension in which you want to find the mapped value + +@code +std::cout << info.UnmapValue("True", 2) << "\n"; +std::cout << info.UnmapValue("False", 2) << "\n"; +@endcode + +@code +0 +1 +@endcode + +These are basic uses of DatasetMapper. Some advanced use cases will be added soon. + +*/ diff --git a/doc/tutorials/tutorials.txt b/doc/tutorials/tutorials.txt index 6f6bb7356d9..1fda3371553 100644 --- a/doc/tutorials/tutorials.txt +++ b/doc/tutorials/tutorials.txt @@ -59,6 +59,7 @@ mlpack. 
- \ref bindings - \ref cv - \ref hpt_guide + - \ref datasetmapper @section policy_tut Policy Class Documentation diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index d9d706745ca..33f0c23dabf 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -10,14 +10,14 @@ set(SOURCES has_serialize.hpp is_naninf.hpp load_csv.hpp - load_csv.cpp + load_numeric_csv.hpp + load_categorical_csv.hpp load.hpp load_image_impl.hpp load_image.cpp load_model_impl.hpp load_vec_impl.hpp load_impl.hpp - load.cpp load_arff.hpp load_arff_impl.hpp normalize_labels.hpp @@ -26,6 +26,7 @@ set(SOURCES save_impl.hpp save_image.cpp split_data.hpp + string_algorithms.hpp imputer.hpp binarize.hpp string_encoding.hpp @@ -34,6 +35,8 @@ set(SOURCES confusion_matrix.hpp one_hot_encoding.hpp one_hot_encoding_impl.hpp + types.hpp + types_impl.hpp ) # add directory name to sources diff --git a/src/mlpack/core/data/detect_file_type.cpp b/src/mlpack/core/data/detect_file_type.cpp index c00954f1c07..0219f95074c 100644 --- a/src/mlpack/core/data/detect_file_type.cpp +++ b/src/mlpack/core/data/detect_file_type.cpp @@ -14,10 +14,7 @@ */ #include "extension.hpp" #include "detect_file_type.hpp" - -#include -#include -#include +#include "string_algorithms.hpp" namespace mlpack { namespace data { @@ -27,18 +24,18 @@ namespace data { * * @param type Type to get the logical name of. 
*/ -std::string GetStringType(const arma::file_type& type) +std::string GetStringType(const FileType& type) { switch (type) { - case arma::csv_ascii: return "CSV data"; - case arma::raw_ascii: return "raw ASCII formatted data"; - case arma::raw_binary: return "raw binary formatted data"; - case arma::arma_ascii: return "Armadillo ASCII formatted data"; - case arma::arma_binary: return "Armadillo binary formatted data"; - case arma::pgm_binary: return "PGM data"; - case arma::hdf5_binary: return "HDF5 data"; - default: return ""; + case FileType::CSVASCII: return "CSV data"; + case FileType::RawASCII: return "raw ASCII formatted data"; + case FileType::RawBinary: return "raw binary formatted data"; + case FileType::ArmaASCII: return "Armadillo ASCII formatted data"; + case FileType::ArmaBinary: return "Armadillo binary formatted data"; + case FileType::PGMBinary: return "PGM data"; + case FileType::HDF5Binary: return "HDF5 data"; + default: return ""; } } @@ -53,7 +50,7 @@ std::string GetStringType(const arma::file_type& type) * * @param f Opened istream to look into to guess the file type. */ -arma::file_type GuessFileType(std::istream& f) +FileType GuessFileType(std::istream& f) { f.clear(); const std::fstream::pos_type pos1 = f.tellg(); @@ -74,7 +71,7 @@ arma::file_type GuessFileType(std::istream& f) // Handle empty files. 
if (nMax == 0) - return arma::file_type_unknown; + return FileType::FileTypeUnknown; const arma::uword nUse = std::min(nMax, arma::uword(4096)); @@ -92,7 +89,7 @@ arma::file_type GuessFileType(std::istream& f) if (!loadOkay) { delete[] dataMem; - return arma::file_type_unknown; + return FileType::FileTypeUnknown; } bool hasBinary = false; @@ -168,12 +165,12 @@ arma::file_type GuessFileType(std::istream& f) delete[] dataMem; if (hasBinary) - return arma::raw_binary; + return FileType::RawBinary; if (hasComma && (hasBracket == false)) - return arma::csv_ascii; + return FileType::CSVASCII; - return arma::raw_ascii; + return FileType::RawASCII; } /** @@ -189,22 +186,22 @@ arma::file_type GuessFileType(std::istream& f) * @param filename Name of the file. * @return The detected file type. */ -arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) +FileType AutoDetect(std::fstream& stream, const std::string& filename) { // Get the extension. std::string extension = Extension(filename); - arma::file_type detectedLoadType = arma::file_type_unknown; + FileType detectedLoadType = FileType::FileTypeUnknown; if (extension == "csv" || extension == "tsv") { detectedLoadType = GuessFileType(stream); - if (detectedLoadType == arma::csv_ascii) + if (detectedLoadType == FileType::CSVASCII) { if (extension == "tsv") Log::Warn << "'" << filename << "' is comma-separated, not " "tab-separated!" << std::endl; } - else if (detectedLoadType == arma::raw_ascii) // .csv file can be tsv. + else if (detectedLoadType == FileType::RawASCII) // .csv file can be tsv. { if (extension == "csv") { @@ -214,7 +211,7 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) const std::streampos pos = stream.tellg(); std::string line; std::getline(stream, line, '\n'); - boost::trim(line); + Trim(line); // Reset stream position. 
stream.seekg(pos); @@ -231,7 +228,7 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) } else { - detectedLoadType = arma::file_type_unknown; + detectedLoadType = FileType::FileTypeUnknown; } } else if (extension == "txt") @@ -251,15 +248,15 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_TXT) { - detectedLoadType = arma::arma_ascii; + detectedLoadType = FileType::ArmaASCII; } else // It's not arma_ascii. Now we let Armadillo guess. { detectedLoadType = GuessFileType(stream); - if (detectedLoadType != arma::raw_ascii && - detectedLoadType != arma::csv_ascii) - detectedLoadType = arma::file_type_unknown; + if (detectedLoadType != FileType::RawASCII && + detectedLoadType != FileType::CSVASCII) + detectedLoadType = FileType::FileTypeUnknown; } } else if (extension == "bin") @@ -277,25 +274,25 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) if (rawHeader == ARMA_MAT_BIN) { - detectedLoadType = arma::arma_binary; + detectedLoadType = FileType::ArmaBinary; } else // We can only assume it's raw binary. { - detectedLoadType = arma::raw_binary; + detectedLoadType = FileType::RawBinary; } } else if (extension == "pgm") { - detectedLoadType = arma::pgm_binary; + detectedLoadType = FileType::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - detectedLoadType = arma::hdf5_binary; + detectedLoadType = FileType::HDF5Binary; } else // Unknown extension... { - detectedLoadType = arma::file_type_unknown; + detectedLoadType = FileType::FileTypeUnknown; } return detectedLoadType; @@ -307,34 +304,34 @@ arma::file_type AutoDetect(std::fstream& stream, const std::string& filename) * @param filename Name of the file whose type we should detect. * @return Detected type of file. 
*/ -arma::file_type DetectFromExtension(const std::string& filename) +FileType DetectFromExtension(const std::string& filename) { const std::string extension = Extension(filename); if (extension == "csv") { - return arma::csv_ascii; + return FileType::CSVASCII; } else if (extension == "txt") { - return arma::raw_ascii; + return FileType::RawASCII; } else if (extension == "bin") { - return arma::arma_binary; + return FileType::ArmaBinary; } else if (extension == "pgm") { - return arma::pgm_binary; + return FileType::PGMBinary; } else if (extension == "h5" || extension == "hdf5" || extension == "hdf" || extension == "he5") { - return arma::hdf5_binary; + return FileType::HDF5Binary; } else { - return arma::file_type_unknown; + return FileType::FileTypeUnknown; } } diff --git a/src/mlpack/core/data/detect_file_type.hpp b/src/mlpack/core/data/detect_file_type.hpp index 8856de29fee..14a9fc4a6d5 100644 --- a/src/mlpack/core/data/detect_file_type.hpp +++ b/src/mlpack/core/data/detect_file_type.hpp @@ -15,6 +15,8 @@ #ifndef MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP #define MLPACK_CORE_DATA_DETECT_FILE_TYPE_HPP +#include "types.hpp" + namespace mlpack { namespace data { @@ -23,7 +25,7 @@ namespace data { * * @param type Type to get the logical name of. */ -std::string GetStringType(const arma::file_type& type); +std::string GetStringType(const FileType& type); /** * Given an istream, attempt to guess the file type. This is taken originally @@ -36,7 +38,7 @@ std::string GetStringType(const arma::file_type& type); * * @param f Opened istream to look into to guess the file type. */ -arma::file_type GuessFileType(std::istream& f); +FileType GuessFileType(std::istream& f); /** * Attempt to auto-detect the type of a file given its extension, and by @@ -51,8 +53,8 @@ arma::file_type GuessFileType(std::istream& f); * @param filename Name of the file. * @return The detected file type. arma::file_type_unknown if unknown. 
*/ -arma::file_type AutoDetect(std::fstream& stream, - const std::string& filename); +FileType AutoDetect(std::fstream& stream, + const std::string& filename); /** * Return the type based only on the extension. @@ -60,7 +62,7 @@ arma::file_type AutoDetect(std::fstream& stream, * @param filename Name of the file whose type we should detect. * @return Detected type of file. arma::file_type_unknown if unknown. */ -arma::file_type DetectFromExtension(const std::string& filename); +FileType DetectFromExtension(const std::string& filename); } // namespace data } // namespace mlpack diff --git a/src/mlpack/core/data/load.cpp b/src/mlpack/core/data/load.cpp deleted file mode 100644 index 71ad35cb643..00000000000 --- a/src/mlpack/core/data/load.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/** - * @file core/data/load.cpp - * @author Tham Ngap Wei - * - * Force instantiation of some Load() overloads to reduce compile time. - * - * mlpack is free software; you may redistribute it and/or modify it under the - * terms of the 3-clause BSD license. You should have received a copy of the - * 3-clause BSD license along with mlpack. If not, see - * http://www.opensource.org/licenses/BSD-3-Clause for more information. - */ -#include "load.hpp" -#include "load_impl.hpp" - -namespace mlpack { -namespace data /** Functions to load and save matrices and models. 
*/ { - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -template bool Load(const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -} // namespace data -} // namespace mlpack diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp index 7f3b7069298..bd0fd7a0664 100644 --- 
a/src/mlpack/core/data/load.hpp +++ b/src/mlpack/core/data/load.hpp @@ -21,6 +21,9 @@ #include "format.hpp" #include "dataset_mapper.hpp" #include "image_info.hpp" +#include "load_csv.hpp" +#include "load_arff.hpp" +#include "detect_file_type.hpp" namespace mlpack { namespace data /** Functions to load and save matrices and models. */ { @@ -71,7 +74,7 @@ bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal = false, const bool transpose = true, - const arma::file_type inputLoadType = arma::auto_detect); + const FileType inputLoadType = FileType::AutoDetect); /** * Loads a sparse matrix from file, using arma::coord_ascii format. This @@ -108,85 +111,6 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; these declarations aren't helpful to - * users. - * - * @cond - */ - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -// size_t and uword should be one of these three typedefs. 
-extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::Mat&, - const bool, - const bool, - const arma::file_type); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -extern template bool Load(const std::string&, - arma::SpMat&, - const bool, - const bool); - -/** - * @endcond - */ - /** * Load a column vector from a file, guessing the filetype from the extension. * @@ -294,52 +218,6 @@ bool Load(const std::string& filename, const bool fatal = false, const bool transpose = true); -/** - * Don't document these with doxygen; they aren't helpful for users to know - * about. 
- * - * @cond - */ - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -extern template bool Load( - const std::string&, - arma::Mat&, - DatasetMapper&, - const bool, - const bool); - -/** - * @endcond - */ - /** * Load a model from a file, guessing the filetype from the extension, or, * optionally, loading the specified format. If automatic extension detection @@ -415,6 +293,8 @@ bool LoadImage(const std::string& filename, } // namespace data } // namespace mlpack +// Include implementation of Load() for matrix. +#include "load_impl.hpp" // Include implementation of model-loading Load() overload. #include "load_model_impl.hpp" // Include implementation of Load() for vectors. diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 0cb7903f808..33be092cfa2 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -14,8 +14,7 @@ // In case it hasn't been included yet. #include "load_arff.hpp" - -#include +#include "string_algorithms.hpp" #include "is_naninf.hpp" namespace mlpack { @@ -47,7 +46,7 @@ void LoadARFF(const std::string& filename, { // Read the next line, then strip whitespace from either side. std::getline(ifs, line, '\n'); - boost::trim(line); + Trim(line); ++headerLines; // Is the first character a comment, or is the line empty? @@ -103,10 +102,10 @@ void LoadARFF(const std::string& filename, // `origDimType` string here instead (which has not had ::tolower used // on it). 
types.push_back(true); - boost::trim_if(origDimType, + TrimIf(origDimType, [](char c) { - return c == '{' || c == '}' || c == ' ' || c == '\t'; + return c == '{' || c == '}' || c == ' ' || c == '\t'; }); boost::escaped_list_separator sep("\\", ",", "\"'"); @@ -117,7 +116,7 @@ void LoadARFF(const std::string& filename, while (it != dimTok.end()) { std::string category = (*it); - boost::trim(category); + Trim(category); categories.push_back(category); ++it; @@ -199,7 +198,7 @@ void LoadARFF(const std::string& filename, while (ifs.good()) { std::getline(ifs, line, '\n'); - boost::trim(line); + Trim(line); // Each line of the @data section must be a CSV (except sparse data, which // we will handle later). So now we can tokenize the // CSV and parse it. The '?' representing a missing value is not allowed, @@ -233,7 +232,7 @@ void LoadARFF(const std::string& filename, { // Strip spaces before mapping. std::string token = *it; - boost::trim(token); + Trim(token); const size_t currentNumMappings = info.NumMappings(col); const eT result = info.template MapString(token, col); @@ -273,7 +272,7 @@ void LoadARFF(const std::string& filename, // error, otherwise we issue a general error. std::stringstream error; std::string tokenStr = token.str(); - boost::trim(tokenStr); + Trim(tokenStr); if (tokenStr == "?") error << "Missing values ('?') not supported, "; else diff --git a/src/mlpack/core/data/load_categorical_csv.hpp b/src/mlpack/core/data/load_categorical_csv.hpp new file mode 100644 index 00000000000..87d6b41adcc --- /dev/null +++ b/src/mlpack/core/data/load_categorical_csv.hpp @@ -0,0 +1,350 @@ +/** + * @file core/data/load_categorical_csv.hpp + * @author Gopi Tatiraju + * + * Load a matrix from file. Matrix may contain categorical data. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. 
If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef MLPACK_CORE_DATA_LOAD_CATEGORICAL_CSV_HPP +#define MLPACK_CORE_DATA_LOAD_CATEGORICAL_CSV_HPP + +#include "load_csv.hpp" + +namespace mlpack{ +namespace data{ + +template +void LoadCSV::LoadCategoricalCSV(arma::Mat &inout, + DatasetMapper &infoSet, + const bool transpose) +{ + CheckOpen(); + + if (transpose) + TransposeParse(inout, infoSet); + else + NonTransposeParse(inout, infoSet); +} + +inline void LoadCSV::CategoricalMatSize( + std::stringstream& lineStream, size_t& col, const char delim) +{ + std::string token; + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + while ((token.empty() || token.back() != '"') && + std::getline(lineStream, token, delim)) { } // Stop at end of line. + } + ++col; + } +} + +template +void LoadCSV::InitializeTransposeMapper(size_t& rows, size_t& cols, + DatasetMapper& info) +{ + // Take a pass through the file. If the DatasetMapper policy requires it, + // we will pass everything as string through MapString(). This might be useful + // if, e.g., the MapPolicy needs to find which dimensions are numeric or + // categorical. + + // Reset to the start of the file. + inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + cols = 0; + + std::string line; + while (inFile.good()) + { + ++cols; + + if (cols == 1) + { + // Extract the number of dimensions. + std::pair dimen = GetMatrixSize(inFile, delim); + rows = dimen.second; + + if (info.Dimensionality() == 0) + { + info.SetDimensionality(rows); + } + else if (info.Dimensionality() != rows) + { + std::ostringstream oss; + oss << "data::LoadCSV(): given DatasetInfo has dimensionality " + << info.Dimensionality() << ", but data has dimensionality " + << rows; + throw std::invalid_argument(oss.str()); + } + } + + std::getline(inFile, line); + // Remove whitespaces from either side. 
+ Trim(line); + + // If it's an empty line, undo the column count and skip it. + if (line.size() == 0) + { + --cols; + continue; + } + + // If we need to do a first pass for the DatasetMapper, do it. + if (MapPolicy::NeedsFirstPass) + { + // In this case we must pass everything we parse to the MapPolicy. + size_t dim = 0; + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side. + Trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + while (token.empty() || token.back() != '"') + { + if (!std::getline(lineStream, token, delim)) break; // Unterminated quote. + tok += delim; + tok += token; + } + token = tok; + } + info.template MapFirstPass(std::move(token), dim++); + } + } + } +} + +template +void LoadCSV::InitializeMapper(size_t& rows, size_t& cols, DatasetMapper& info) +{ + // Take a pass through the file. If the DatasetMapper policy requires it, + // we will pass everything as string through MapString(). This might be useful + // if, e.g., the MapPolicy needs to find which dimensions are numeric or + // categorical. + + // Reset to the start of the file. + inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + cols = 0; + + // First, count the number of rows in the file (this is the dimensionality). + std::string line; + while (std::getline(inFile, line)) + ++rows; + + // Reset the DatasetInfo object, if needed. + if (info.Dimensionality() == 0) + { + info.SetDimensionality(rows); + } + else if (info.Dimensionality() != rows) + { + std::ostringstream oss; + oss << "data::LoadCSV(): given DatasetInfo has dimensionality " + << info.Dimensionality() << ", but data has dimensionality " + << rows; + throw std::invalid_argument(oss.str()); + } + + // Now, jump back to the beginning of the file. 
+ inFile.clear(); + inFile.seekg(0, std::ios::beg); + rows = 0; + + while (std::getline(inFile, line)) + { + ++rows; + // Remove whitespaces from either side. + Trim(line); + if (rows == 1) + { + // Extract the number of columns. + std::pair dimen = GetMatrixSize(inFile, delim); + cols = dimen.second; + } + + // I guess this is technically a second pass, but that's ok... still the + // same idea... + if (MapPolicy::NeedsFirstPass) + { + // Every token on this line belongs to dimension (rows - 1). + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespace from either side. + Trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + while (token.empty() || token.back() != '"') + { + if (!std::getline(lineStream, token, delim)) break; // Unterminated quote. + tok += delim; + tok += token; + } + token = tok; + } + info.template MapFirstPass(std::move(token), rows - 1); + } + } + } +} + +template +void LoadCSV::TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) +{ + // Get matrix size. This also initializes infoSet correctly. + size_t rows, cols; + InitializeTransposeMapper(rows, cols, infoSet); + + // Set the matrix size. + inout.set_size(rows, cols); + + // Initialize auxiliary variables. + size_t row = 0; + size_t col = 0; + std::string line; + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + while (std::getline(inFile, line)) + { + // Remove whitespaces from either side. + Trim(line); + // Reset the row we are looking at. (Remember this is transposed.) + row = 0; + std::stringstream lineStream; + std::string token; + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + // Remove whitespaces from either side. + Trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + // First part of the string. 
+ std::string tok = token; + while (token.empty() || token.back() != '"') + { + if (!std::getline(lineStream, token, delim)) break; // Unterminated quote. + tok += delim; + tok += token; + } + token = tok; + } + inout(row, col) = infoSet.template MapString(std::move(token), row); + row++; + } + + // Make sure we got the right number of rows. + if (row != rows) + { + std::ostringstream oss; + oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row + << ") on line " << col << "; should be " << rows << " dimensions."; + throw std::runtime_error(oss.str()); + } + // Increment the column index. + ++col; + } +} + +template +void LoadCSV::NonTransposeParse(arma::Mat& inout, + DatasetMapper& infoSet) +{ + // Get the size of the matrix. + size_t rows, cols; + InitializeMapper(rows, cols, infoSet); + + // Set up output matrix. + inout.set_size(rows, cols); + size_t row = 0; + size_t col = 0; + + // Reset file position. + std::string line; + inFile.clear(); + inFile.seekg(0, std::ios::beg); + + while (std::getline(inFile, line)) + { + // Remove whitespaces from either side. + Trim(line); + + std::stringstream lineStream; + std::string token; + + lineStream.clear(); + lineStream.str(line); + + while (lineStream.good()) + { + if (token == "\t") + token.clear(); + + std::getline(lineStream, token, delim); + // Remove whitespace from either side. + Trim(token); + + if (token[0] == '"' && token[token.size() - 1] != '"') + { + std::string tok = token; + while (token.empty() || token.back() != '"') + { + if (!std::getline(lineStream, token, delim)) break; // Unterminated quote. + tok += delim; + tok += token; + } + token = tok; + } + inout(row, col++) = infoSet.template MapString(std::move(token), row); + } + + // Make sure we got the right number of columns. 
+ if (col != cols) + { + std::ostringstream oss; + oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" + << col << ") on line " << row << "; should be " << cols + << " dimensions."; + throw std::runtime_error(oss.str()); + } + ++row; col = 0; + } +} + +} //namespace data +} //namespace mlpack + +#endif diff --git a/src/mlpack/core/data/load_csv.cpp b/src/mlpack/core/data/load_csv.cpp deleted file mode 100644 index 5a150088232..00000000000 --- a/src/mlpack/core/data/load_csv.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/** - * @file core/data/load_csv.cpp - * @author Tham Ngap Wei - * @author Mehul Kumar Nirala - * - * A CSV reader that uses boost::spirit. - * - * mlpack is free software; you may redistribute it and/or modify it under the - * terms of the 3-clause BSD license. You should have received a copy of the - * 3-clause BSD license along with mlpack. If not, see - * http://www.opensource.org/licenses/BSD-3-Clause for more information. - */ -#include "load_csv.hpp" - -using namespace boost::spirit; - -namespace mlpack { -namespace data { - -LoadCSV::LoadCSV(const std::string& file) : - extension(Extension(file)), - filename(file), - inFile(file) -{ - // Attempt to open stream. - CheckOpen(); - - //! Spirit rule for parsing quoted string. - boost::spirit::qi::rule quotedRule; - // Match quoted strings as: "string" or 'string' - quotedRule = qi::raw[(qi::char_("'") >> *((qi::char_ - "'") | - "'" >> qi::char_("'")) >> "'") | - (qi::char_('"') >> *((qi::char_ - '"') | - '"' >> qi::char_('"')) >> '"') ]; - - // Set rules. - if (extension == "csv") - { - // Match all characters that are not ',', '\r', or '\n'. - stringRule = quotedRule.copy() | qi::raw[*~qi::char_(",\r\n")]; - } - else if (extension == "txt") - { - // Match all characters that are not ' ', ',', '\r', or '\n'. - stringRule = quotedRule.copy() | qi::raw[*~qi::char_(" ,\r\n")]; - } - else // TSV. - { - // Match all characters that are not '\t', '\r', or '\n'. 
- stringRule = quotedRule.copy() | qi::raw[*~qi::char_("\t\r\n")]; - } - - if (extension == "csv") - { - // Extract a single comma as the delimiter, catching whitespace on either - // side. - delimiterRule = qi::raw[(*qi::char_(" ") >> qi::char_(",") >> - *qi::char_(" "))]; - } - else if (extension == "txt") - { - // This one is a little more difficult, we need to catch any number of - // spaces more than one. - delimiterRule = qi::raw[+qi::char_(" ")]; - } - else // TSV. - { - // Catch a tab character, possibly with whitespace on either side. - delimiterRule = qi::raw[(*qi::char_(" ") >> qi::char_("\t") >> - *qi::char_(" "))]; - } -} - -void LoadCSV::CheckOpen() -{ - if (!inFile.is_open()) - { - std::ostringstream oss; - oss << "Cannot open file '" << filename << "'. " << std::endl; - throw std::runtime_error(oss.str()); - } - - inFile.unsetf(std::ios::skipws); -} - -} // namespace data -} // namespace mlpack diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp index ce7d0bb9b18..e0eb527665a 100644 --- a/src/mlpack/core/data/load_csv.hpp +++ b/src/mlpack/core/data/load_csv.hpp @@ -1,8 +1,38 @@ /** * @file core/data/load_csv.hpp * @author ThamNgapWei + * @author Conrad Sanderson + * @author Gopi M. Tatiraju * - * This is a csv parsers which use to parse the csv file format + * This csv parser is designed by taking reference from + * armadillo's csv parser. In this mlpack's version, all + * the arma dependencies were removed or replaced + * accordingly, making the parser totally independent of + * armadillo. + * + * As the implementation is inspired from Armadillo it + * is necessary to add two different licenses. One for + * Armadillo and another for mlpack. 
+ * + * https://gitlab.com/conradsnicta/armadillo-code/-/blob/10.5.x/include/armadillo_bits/diskio_meat.hpp + * + * The original Armadillo parser is licensed under the + * BSD-compatible Apache license, shown below: + * + * Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au) + * Copyright 2008-2016 National ICT Australia (NICTA) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ------------------------------------------------------------------------ * * mlpack is free software; you may redistribute it and/or modify it under the * terms of the 3-clause BSD license. You should have received a copy of the @@ -12,389 +42,275 @@ #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP #define MLPACK_CORE_DATA_LOAD_CSV_HPP -#include -#include - -#include #include - #include #include +#include "string_algorithms.hpp" #include "extension.hpp" #include "format.hpp" #include "dataset_mapper.hpp" +#include "types.hpp" namespace mlpack { namespace data { /** - *Load the csv file.This class use boost::spirit - *to implement the parser, please refer to following link - *http://theboostcpplibraries.com/boost.spirit for quick review. + * Load the csv file. This class contains functions + * to load numeric and categorical data. */ class LoadCSV { public: - /** - * Construct the LoadCSV object on the given file. This will construct the - * rules necessary for loading and attempt to open the file. 
- */ - LoadCSV(const std::string& file); - /** - * Load the file into the given matrix with the given DatasetMapper object. - * Throws exceptions on errors. - * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper to use while loading. - * @param transpose If true, the matrix should be transposed on loading - * (default). - */ - template - void Load(arma::Mat &inout, - DatasetMapper &infoSet, - const bool transpose = true) + LoadCSV() { - CheckOpen(); - - if (transpose) - TransposeParse(inout, infoSet); - else - NonTransposeParse(inout, infoSet); + // Nothing to do here. + // To initialize the class object. } /** - * Peek at the file to determine the number of rows and columns in the matrix, - * assuming a non-transposed matrix. This will also take a first pass over - * the data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info - * object will be re-initialized with the correct dimensionality. - * - * @param rows Variable to be filled with the number of rows. - * @param cols Variable to be filled with the number of columns. - * @param info DatasetMapper object to use for first pass. - */ - template - void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper& info) + * Construct the LoadCSV object on the given file. This will construct the + * rules necessary for loading and will attempt to open the file. This will also + * initialize the delimiter character for parsing. + * + * @param file path of the dataset. + */ + LoadCSV(const std::string& file) : + extension(Extension(file)), + filename(file), + inFile(file) { - using namespace boost::spirit; - - // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). This might be useful - // if, e.g., the MapPolicy needs to find which dimensions are numeric or - // categorical. - - // Reset to the start of the file. 
- inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - cols = 0; - - // First, count the number of rows in the file (this is the dimensionality). - std::string line; - while (std::getline(inFile, line)) + if (extension == "csv") { - ++rows; + delim = ','; } - - // Reset the DatasetInfo object, if needed. - if (info.Dimensionality() == 0) + else if (extension == "tsv") { - info.SetDimensionality(rows); + delim = '\t'; } - else if (info.Dimensionality() != rows) + else if (extension == "txt") { - std::ostringstream oss; - oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; - throw std::invalid_argument(oss.str()); + delim = ' '; } - // Now, jump back to the beginning of the file. - inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - - while (std::getline(inFile, line)) - { - ++rows; - // Remove whitespace from either side. - boost::trim(line); - - if (rows == 1) - { - // Extract the number of columns. - auto findColSize = [&cols](iter_type) { ++cols; }; - qi::parse(line.begin(), line.end(), - stringRule[findColSize] % delimiterRule); - } - - // I guess this is technically a second pass, but that's ok... still the - // same idea... - if (MapPolicy::NeedsFirstPass) - { - // In this case we must pass everything we parse to the MapPolicy. - auto firstPassMap = [&](const iter_type& iter) - { - std::string str(iter.begin(), iter.end()); - boost::trim(str); - - info.template MapFirstPass(std::move(str), rows - 1); - }; - - // Now parse the line. - qi::parse(line.begin(), line.end(), - stringRule[firstPassMap] % delimiterRule); - } - } + CheckOpen(); } + // Functions for Numeric Parser + + /** + * Returns a bool value showing whether data was loaded successfully or not. + * + * Parses a csv file and loads the data into the given matrix. In the first pass, + * the function will determine the number of cols and rows in the given file. 
+ * Once the rows and cols are fixed we initialize the matrix with zeros. In + * the second pass, the function converts each value to required datatype + * and sets it equal to val. + * + * @param x Matrix in which data will be loaded. + * @param f File stream to access the data file. + */ + template + bool LoadNumericCSV(arma::Mat& x, std::fstream& f); + /** - * Peek at the file to determine the number of rows and columns in the matrix, - * assuming a transposed matrix. This will also take a first pass over the - * data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info - * object will be re-initialized with the correct dimensionality. + * Converts the given string token to assigned datatype and assigns + * this value to the given address. The address here will be a + * matrix location eg. matrix(row, col). + * + * Token is always read as a string, if the given token is +/-INF or NAN + * it converts them to infinity and NAN using numeric_limits. + * + * @param val Token's value will be assigned to this address. + * @param token Value which should be assigned. + */ + template + bool ConvertToken(eT& val, const std::string& token); + + /** + * Calculate the number of columns in each row + * and assign the value to the col. This function + * will work only for numeric data. * - * @param rows Variable to be filled with the number of rows. - * @param cols Variable to be filled with the number of columns. - * @param info DatasetMapper object to use for first pass. + * @param lineStream a single row of data. + * @param col number of columns in lineStream. + * @param delim delimiter character. */ - template - void GetTransposeMatrixSize(size_t& rows, - size_t& cols, - DatasetMapper& info) - { - using namespace boost::spirit; + inline void NumericMatSize(std::stringstream& lineStream, size_t& col, + const char delim); - // Take a pass through the file. If the DatasetMapper policy requires it, - // we will pass everything string through MapString(). 
This might be useful - // if, e.g., the MapPolicy needs to find which dimensions are numeric or - // categorical. + // Functions for Categorical Parse. - // Reset to the start of the file. - inFile.clear(); - inFile.seekg(0, std::ios::beg); - rows = 0; - cols = 0; + /** + * Load the file into the given matrix with the given DatasetMapper object. + * Throws exceptions on errors. + * + * @param inout Matrix to load into. + * @param infoSet DatasetMapper to use while loading. + * @param transpose If true, the matrix should be transposed on loading(default). + */ + template + void LoadCategoricalCSV(arma::Mat &inout, + DatasetMapper &infoSet, + const bool transpose = true); - std::string line; - while (std::getline(inFile, line)) - { - ++cols; - // Remove whitespace from either side. - boost::trim(line); - - if (cols == 1) - { - // Extract the number of dimensions. - auto findRowSize = [&rows](iter_type) { ++rows; }; - qi::parse(line.begin(), line.end(), - stringRule[findRowSize] % delimiterRule); - - // Reset the DatasetInfo object, if needed. - if (info.Dimensionality() == 0) - { - info.SetDimensionality(rows); - } - else if (info.Dimensionality() != rows) - { - std::ostringstream oss; - oss << "data::LoadCSV(): given DatasetInfo has dimensionality " - << info.Dimensionality() << ", but data has dimensionality " - << rows; - throw std::invalid_argument(oss.str()); - } - } - - // If we need to do a first pass for the DatasetMapper, do it. - if (MapPolicy::NeedsFirstPass) - { - size_t dim = 0; - - // In this case we must pass everything we parse to the MapPolicy. - auto firstPassMap = [&](const iter_type& iter) - { - std::string str(iter.begin(), iter.end()); - boost::trim(str); - - info.template MapFirstPass(std::move(str), dim++); - }; - - // Now parse the line. 
- qi::parse(line.begin(), line.end(), - stringRule[firstPassMap] % delimiterRule); - } - } - } + /** + * Peek at the file to determine the number of rows and columns in the matrix, + * assuming a non-transposed matrix. This will also take a first pass over + * the data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info + * object will be re-initialized with the correct dimensionality. + * + * @param rows Variable to be filled with the number of rows. + * @param cols Variable to be filled with the number of columns. + * @param info DatasetMapper object to use for first pass. + */ + template + void InitializeMapper(size_t& rows, size_t& cols, + DatasetMapper& info); - private: - using iter_type = boost::iterator_range; + /** + * Peek at the file to determine the number of rows and columns in the matrix, + * assuming a transposed matrix. This will also take a first pass over the + * data for DatasetMapper, if MapPolicy::NeedsFirstPass is true. The info + * object will be re-initialized with the correct dimensionality. + * + * @param rows Variable to be filled with the number of rows. + * @param cols Variable to be filled with the number of columns. + * @param info DatasetMapper object to use for first pass. + */ + template + void InitializeTransposeMapper(size_t& rows, size_t& cols, + DatasetMapper& info); /** - * Check whether or not the file has successfully opened; throw an exception - * if not. + * Calculate the number of columns in each row + * and assign the value to the col. This function + * will work for categorical data. + * + * @param lineStream a single row of data. + * @param col the number of columns in lineStream. + * @param delim the delimiter character. */ - void CheckOpen(); + inline void CategoricalMatSize(std::stringstream& lineStream, size_t& col, + const char delim); + + // Functions common to both numeric & categorical parser. /** - * Parse a non-transposed matrix. + * Get the size of the matrix. 
Based on isNumeric the function can be used + * for both numeric_parse and categorical_parse. * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper object to load with. + * @param f fstream stream to open the data file. + * @param delim char delimiter charecter. */ - template - void NonTransposeParse(arma::Mat& inout, - DatasetMapper& infoSet) + template + inline std::pair GetMatrixSize(std::fstream& f, + const char delim = ',') { - using namespace boost::spirit; + bool loadOkay = f.good(); - // Get the size of the matrix. - size_t rows, cols; - GetMatrixSize(rows, cols, infoSet); + f.clear(); + const std::fstream::pos_type pos1 = f.tellg(); - // Set up output matrix. - inout.set_size(rows, cols); - size_t row = 0; - size_t col = 0; + size_t fnRows = 0; + size_t fnCols = 0; + std::string lineString; + std::stringstream lineStream; + std::string token; - // Reset file position. - std::string line; - inFile.clear(); - inFile.seekg(0, std::ios::beg); - - auto setCharClass = [&](iter_type const &iter) + while (f.good() && loadOkay) { - std::string str(iter.begin(), iter.end()); - if (str == "\t") - { - str.clear(); - } - boost::trim(str); + // Get a row of data. + std::getline(f, lineString); + if (lineString.size() == 0) + break; + + lineStream.clear(); + lineStream.str(lineString); + size_t lineNCols = 0; + + // Get number of columns based on the type of data. + if (isNumeric) + NumericMatSize(lineStream, lineNCols, delim); + else + CategoricalMatSize(lineStream, lineNCols, delim); + + // If there are different number of columns in each + // row, then the highest number of cols will be + // considered as the size of the matrix. Missing + // elements will be filled as 0. + if (fnCols < lineNCols) + fnCols = lineNCols; + + ++fnRows; + } - inout(row, col++) = infoSet.template MapString(std::move(str), row); - }; + f.clear(); + f.seekg(pos1); - while (std::getline(inFile, line)) - { - // Remove whitespace from either side. 
- boost::trim(line); - - // Parse the numbers from a line (ex: 1,2,3,4); if the parser finds a - // number it will execute the setNum function. - const bool canParse = qi::parse(line.begin(), line.end(), - stringRule[setCharClass] % delimiterRule); - - // Make sure we got the right number of rows. - if (col != cols) - { - std::ostringstream oss; - oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions (" - << col << ") on line " << row << "; should be " << cols - << " dimensions."; - throw std::runtime_error(oss.str()); - } - - if (!canParse) - { - std::ostringstream oss; - oss << "LoadCSV::NonTransposeParse(): parsing error on line " << col - << "!"; - throw std::runtime_error(oss.str()); - } - - ++row; col = 0; - } + std::pair mat_size(fnRows, fnCols); + + return mat_size; } - /** - * Parse a transposed matrix. - * - * @param inout Matrix to load into. - * @param infoSet DatasetMapper to load with. - */ - template - void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet) - { - using namespace boost::spirit; - - // Get matrix size. This also initializes infoSet correctly. - size_t rows, cols; - GetTransposeMatrixSize(rows, cols, infoSet); - - // Set the matrix size. - inout.set_size(rows, cols); - - // Initialize auxiliary variables. - size_t row = 0; - size_t col = 0; - std::string line; - inFile.clear(); - inFile.seekg(0, std::ios::beg); - - /** - * This is the parse rule for strings. When we get a string we have to pass - * it to the DatasetMapper. - */ - auto parseString = [&](iter_type const &iter) - { - // All parsed values must be mapped. - std::string str(iter.begin(), iter.end()); - boost::trim(str); - inout(row, col) = infoSet.template MapString(std::move(str), row); - ++row; - }; + private: - while (std::getline(inFile, line)) + /** + * Check whether or not the file has successfully opened; throw an exception + * if not. + */ + inline void CheckOpen() + { + // Check if the file is opening. 
+ if (!inFile.is_open()) { - // Remove whitespace from either side. - boost::trim(line); - - // Reset the row we are looking at. (Remember this is transposed.) - row = 0; - - // Now use boost::spirit to parse the characters of the line; - // parseString() will be called when a token is detected. - const bool canParse = qi::parse(line.begin(), line.end(), - stringRule[parseString] % delimiterRule); - - // Make sure we got the right number of rows. - if (row != rows) - { - std::ostringstream oss; - oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row - << ") on line " << col << "; should be " << rows << " dimensions."; - throw std::runtime_error(oss.str()); - } - - if (!canParse) - { - std::ostringstream oss; - oss << "LoadCSV::TransposeParse(): parsing error on line " << col - << "!"; - throw std::runtime_error(oss.str()); - } - - // Increment the column index. - ++col; + std::ostringstream oss; + oss << "Cannot open file '" << filename << "'. " << std::endl; + // Throw an exception if the file is not opening. + throw std::runtime_error(oss.str()); } + + // Clear format flag. + inFile.unsetf(std::ios::skipws); } - //! Spirit rule for parsing. - boost::spirit::qi::rule stringRule; - //! Spirit rule for delimiters (i.e. ',' for CSVs). - boost::spirit::qi::rule delimiterRule; + // Functions for Categorical Parse. + + /** + * Parse a non-transposed matrix. + * + * @param input Matrix to load into. + * @param infoSet DatasetMapper object to load with. + */ + template + void NonTransposeParse(arma::Mat& inout, + DatasetMapper& infoSet); + + /** + * Parse a transposed matrix. + * + * @param input Matrix to load into. + * @param infoSet DatasetMapper to load with. + */ + template + void TransposeParse(arma::Mat& inout, DatasetMapper& infoSet); //! Extension (type) of file. std::string extension; //! Name of file. std::string filename; //! Opened stream for reading. - std::ifstream inFile; + std::fstream inFile; + //! Delimiter char. 
+ char delim; }; } // namespace data } // namespace mlpack +#include "load_numeric_csv.hpp" +#include "load_categorical_csv.hpp" + #endif diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 344a8b6c13b..226960a7e03 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -1,6 +1,7 @@ /** * @file core/data/load_impl.hpp * @author Ryan Curtin + * @author Gopi Tatiraju * * Implementation of templatized load() function defined in load.hpp. * @@ -13,21 +14,16 @@ #define MLPACK_CORE_DATA_LOAD_IMPL_HPP // In case it hasn't already been included. +#include "load.hpp" #include #include #include -#include "load_csv.hpp" -#include "load.hpp" #include "extension.hpp" #include "detect_file_type.hpp" -#include -#include -#include - -#include "load_arff.hpp" +#include "string_algorithms.hpp" namespace mlpack { namespace data { @@ -43,7 +39,7 @@ std::vector ToTokens(Tokenizer& lineTok) [&tokens](std::string const &str) { std::string trimmedToken(str); - boost::trim(trimmedToken); + Trim(trimmedToken); return std::move(trimmedToken); }); @@ -90,12 +86,13 @@ bool Load(const std::string& filename, arma::Mat& matrix, const bool fatal, const bool transpose, - const arma::file_type inputLoadType) + const FileType inputLoadType) { Timer::Start("loading_data"); // Catch nonexistent files by opening the stream ourselves. std::fstream stream; + #ifdef _WIN32 // Always open in binary mode on Windows. stream.open(filename.c_str(), std::fstream::in | std::fstream::binary); #else @@ -113,14 +110,14 @@ bool Load(const std::string& filename, return false; } - arma::file_type loadType = inputLoadType; + FileType loadType = inputLoadType; std::string stringType; - if (inputLoadType == arma::auto_detect) + if (inputLoadType == FileType::AutoDetect) { // Attempt to auto-detect the type from the given file. loadType = AutoDetect(stream, filename); // Provide error if we don't know the type. 
- if (loadType == arma::file_type_unknown) + if (loadType == FileType::FileTypeUnknown) { Timer::Stop("loading_data"); if (fatal) @@ -137,7 +134,7 @@ bool Load(const std::string& filename, stringType = GetStringType(loadType); #ifndef ARMA_USE_HDF5 - if (inputLoadType == arma::hdf5_binary) + if (inputLoadType == FileType::HDF5Binary) { // Ensure that HDF5 is supported. Timer::Stop("loading_data"); @@ -155,7 +152,7 @@ bool Load(const std::string& filename, #endif // Try to load the file; but if it's raw_binary, it could be a problem. - if (loadType == arma::raw_binary) + if (loadType == FileType::RawBinary) Log::Warn << "Loading '" << filename << "' as " << stringType << "; " << "but this may not be the actual filetype!" << std::endl; else @@ -164,10 +161,17 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. bool success; - if (loadType != arma::hdf5_binary) - success = matrix.load(stream, loadType); + LoadCSV loader; + + if (loadType != FileType::HDF5Binary) + { + if (loadType == FileType::CSVASCII) + success = loader.LoadNumericCSV(matrix, stream); + else + success = matrix.load(stream, ToArmaFileType(loadType)); + } else - success = matrix.load(filename, loadType); + success = matrix.load(filename, ToArmaFileType(loadType)); if (!success) { @@ -232,7 +236,7 @@ bool Load(const std::string& filename, try { LoadCSV loader(filename); - loader.Load(matrix, info, transpose); + loader.LoadCategoricalCSV(matrix, info, transpose); } catch (std::exception& e) { diff --git a/src/mlpack/core/data/load_numeric_csv.hpp b/src/mlpack/core/data/load_numeric_csv.hpp new file mode 100644 index 00000000000..a43bdd02de3 --- /dev/null +++ b/src/mlpack/core/data/load_numeric_csv.hpp @@ -0,0 +1,171 @@ +/** + * @file core/data/load_numeric_csv.hpp + * @author Gopi Tatiraju + * + * Load a matrix from file. Matrix should contain only numeric data. 
+ * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef MLPACK_CORE_DATA_LOAD_NUMERIC_CSV_HPP +#define MLPACK_CORE_DATA_LOAD_NUMERIC_CSV_HPP + +#include "load_csv.hpp" + +namespace mlpack{ +namespace data{ + +template +bool LoadCSV::ConvertToken(eT& val, + const std::string& token) +{ + const size_t N = size_t(token.length()); + // Fill empty data points with 0. + if (N == 0) + { + val = eT(0); + return true; + } + + const char* str = token.c_str(); + + // Checks for +/-INF and NAN + // Converts them to their equivalent representation + // from numeric_limits. + if ((N == 3) || (N == 4)) + { + const bool neg = (str[0] == '-'); + const bool pos = (str[0] == '+'); + + const size_t offset = ((neg || pos) && (N == 4)) ? 1 : 0; + + const char sigA = str[offset]; + const char sigB = str[offset + 1]; + const char sigC = str[offset + 2]; + + if (((sigA == 'i') || (sigA == 'I')) && + ((sigB == 'n') || (sigB == 'N')) && + ((sigC == 'f') || (sigC == 'F'))) + { + val = neg ? -(std::numeric_limits + ::infinity()) : std::numeric_limits::infinity(); + return true; + } + else if (((sigA == 'n') || (sigA == 'N')) && + ((sigB == 'a') || (sigB == 'A')) && + ((sigC == 'n') || (sigC == 'N'))) + { + val = std::numeric_limits::quiet_NaN(); + return true; + } + } + + char* endptr = nullptr; + + // Convert the token into correct type. + // If we have a eT as unsigned int, + // it will convert all negative numbers to 0. 
+ if (std::is_floating_point::value) + { + val = eT(std::strtod(str, &endptr)); + } + else if (std::is_integral::value) + { + if (std::is_signed::value) + val = eT(std::strtoll(str, &endptr, 10)); + else + { + if (str[0] == '-') + { + val = eT(0); + return true; + } + val = eT(std::strtoull(str, &endptr, 10)); + } + } + // If none of the above conditions was executed, + // then the conversion will fail. + else + return false; + + // If any of strtod() or strtoll() fails, str will + // be set to nullptr and this condition will be + // executed. + if (str == endptr) + return false; + + return true; +} + +template +bool LoadCSV::LoadNumericCSV(arma::Mat& x, std::fstream& f) +{ + bool loadOkay = f.good(); + f.clear(); + std::pair mat_size = GetMatrixSize(f); + x.zeros(mat_size.first, mat_size.second); + size_t row = 0; + + std::string lineString; + std::stringstream lineStream; + std::string token; + + while (f.good()) + { + // Parse the file line by line. + std::getline(f, lineString); + + if (lineString.size() == 0) + break; + + lineStream.clear(); + lineStream.str(lineString); + + size_t col = 0; + + while (lineStream.good()) + { + // Parse each line. + std::getline(lineStream, token, ','); + + // This will handle loading of both dense and sparse. + // Initialize tmp_val of type eT with value 0. + eT tmpVal = eT(0); + + if (ConvertToken(tmpVal, token)) + { + x.at(row, col) = tmpVal; + ++col; + } + else + { + // Printing failed token and it's location. 
+ Log::Warn << "Failed to convert token " << token << ", at row " << row + << ", column " << col << " of matrix!"; + + return false; + } + } + ++row; + } + return loadOkay; +} + +inline void LoadCSV::NumericMatSize(std::stringstream& lineStream, + size_t& col, + const char delim) +{ + std::string token; + while (lineStream.good()) + { + std::getline(lineStream, token, delim); + ++col; + } +} + +} // namespace data +} // namespace mlpack + +#endif diff --git a/src/mlpack/core/data/save.hpp b/src/mlpack/core/data/save.hpp index 685e7ebeb31..19e11ee1616 100644 --- a/src/mlpack/core/data/save.hpp +++ b/src/mlpack/core/data/save.hpp @@ -20,6 +20,7 @@ #include "format.hpp" #include "image_info.hpp" +#include "detect_file_type.hpp" namespace mlpack { namespace data /** Functions to load and save matrices. */ { @@ -64,7 +65,7 @@ bool Save(const std::string& filename, const arma::Mat& matrix, const bool fatal = false, bool transpose = true, - arma::file_type inputSaveType = arma::auto_detect); + FileType inputSaveType = FileType::AutoDetect); /** * Saves a sparse matrix to file, guessing the filetype from the diff --git a/src/mlpack/core/data/save_impl.hpp b/src/mlpack/core/data/save_impl.hpp index c51e666b1f9..18a353dbc69 100644 --- a/src/mlpack/core/data/save_impl.hpp +++ b/src/mlpack/core/data/save_impl.hpp @@ -15,7 +15,6 @@ // In case it hasn't already been included. #include "save.hpp" #include "extension.hpp" -#include "detect_file_type.hpp" #include #include @@ -28,7 +27,7 @@ template bool Save(const std::string& filename, const arma::Col& vec, const bool fatal, - arma::file_type inputSaveType) + FileType inputSaveType) { // Don't transpose: one observation per line (for CSVs at least). 
return Save(filename, vec, fatal, false, inputSaveType); @@ -38,7 +37,7 @@ template bool Save(const std::string& filename, const arma::Row& rowvec, const bool fatal, - arma::file_type inputSaveType) + FileType inputSaveType) { return Save(filename, rowvec, fatal, true, inputSaveType); } @@ -48,18 +47,18 @@ bool Save(const std::string& filename, const arma::Mat& matrix, const bool fatal, bool transpose, - arma::file_type inputSaveType) + FileType inputSaveType) { Timer::Start("saving_data"); - arma::file_type saveType = inputSaveType; + FileType saveType = inputSaveType; std::string stringType = ""; - if (inputSaveType == arma::auto_detect) + if (inputSaveType == FileType::AutoDetect) { // Detect the file type using only the extension. saveType = DetectFromExtension(filename); - if (saveType == arma::file_type_unknown) + if (saveType == FileType::FileTypeUnknown) { if (fatal) Log::Fatal << "Could not detect type of file '" << filename << "' for " @@ -105,11 +104,11 @@ bool Save(const std::string& filename, #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == arma::hdf5_binary) ? - tmp.quiet_save(filename, saveType) : - tmp.quiet_save(stream, saveType); + const bool success = (saveType == FileType::HDF5Binary) ? + tmp.quiet_save(filename, ToArmaFileType(saveType)) : + tmp.quiet_save(stream, ToArmaFileType(saveType)); #else - const bool success = tmp.quiet_save(stream, saveType); + const bool success = tmp.quiet_save(stream, ToArmaFileType(saveType)); #endif if (!success) { @@ -126,11 +125,11 @@ bool Save(const std::string& filename, { #ifdef ARMA_USE_HDF5 // We can't save with streams for HDF5. - const bool success = (saveType == arma::hdf5_binary) ? - matrix.quiet_save(filename, saveType) : - matrix.quiet_save(stream, saveType); + const bool success = (saveType == FileType::HDF5Binary) ? 
+ matrix.quiet_save(filename, ToArmaFileType(saveType)) : + matrix.quiet_save(stream, ToArmaFileType(saveType)); #else - const bool success = matrix.quiet_save(stream, saveType); + const bool success = matrix.quiet_save(stream, ToArmaFileType(saveType)); #endif if (!success) { @@ -195,23 +194,23 @@ bool Save(const std::string& filename, } bool unknownType = false; - arma::file_type saveType; + FileType saveType; std::string stringType; if (extension == "txt" || extension == "tsv") { - saveType = arma::coord_ascii; + saveType = FileType::CoordASCII; stringType = "raw ASCII formatted data"; } else if (extension == "bin") { - saveType = arma::arma_binary; + saveType = FileType::ArmaBinary; stringType = "Armadillo binary formatted data"; } else { unknownType = true; - saveType = arma::raw_binary; // Won't be used; prevent a warning. + saveType = FileType::RawBinary; // Won't be used; prevent a warning. stringType = ""; } @@ -241,7 +240,7 @@ bool Save(const std::string& filename, tmp = trans(matrix); } - const bool success = tmp.quiet_save(stream, saveType); + const bool success = tmp.quiet_save(stream, ToArmaFileType(saveType)); if (!success) { Timer::Stop("saving_data"); diff --git a/src/mlpack/core/data/string_algorithms.hpp b/src/mlpack/core/data/string_algorithms.hpp new file mode 100644 index 00000000000..5bc3291733e --- /dev/null +++ b/src/mlpack/core/data/string_algorithms.hpp @@ -0,0 +1,115 @@ +/** + * @file core/data/string_algorithms.hpp + * @author Gopi M. Tatiraju + * + * Utility functions related to string manipulation + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. 
#ifndef MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP
#define MLPACK_CORE_DATA_STRING_ALGORITHMS_HPP

namespace mlpack {
namespace data {

/**
 * Strip whitespace (as classified by std::isspace) from both ends of a
 * string, in place.  A string that is empty or consists only of whitespace
 * becomes the empty string.
 *
 * @param str the string to be trimmed.
 */
inline void Trim(std::string& str)
{
  size_t first = 0;
  size_t last = str.size();

  // std::isspace() is only defined for values representable as unsigned char
  // (and EOF), so every character is converted before it is classified.
  while (first < last &&
         std::isspace(static_cast<unsigned char>(str[first])))
  {
    ++first;
  }

  // `last` is one past the final kept character and never moves below
  // `first`, so an all-whitespace string ends with first == last instead of
  // walking off the front of the string.
  while (last > first &&
         std::isspace(static_cast<unsigned char>(str[last - 1])))
  {
    --last;
  }

  // Drop the tail first so that `first` is still a valid offset.
  str.erase(last);
  str.erase(0, first);
}

/**
 * Trim off characters from the start and end of the string.  The supplied
 * function is used to determine which characters will be trimmed off.
 *
 * A string that is empty or consists only of ' ' characters becomes the empty
 * string regardless of the predicate; this matches the function's previous
 * behavior and is kept for compatibility.
 *
 * @param str the string to be trimmed.
 * @param func function to determine the characters which should be trimmed.
 */
inline void TrimIf(std::string &str, std::function<bool(char)> func)
{
  if (str.find_first_not_of(' ') == std::string::npos)
  {
    str.clear();
    return;
  }

  size_t first = 0;
  size_t last = str.size();

  while (first < last && func(str[first]))
    ++first;

  // Stops at `first`, so a string in which every character matches yields an
  // empty range rather than relying on unsigned wrap-around.
  while (last > first && func(str[last - 1]))
    --last;

  // Drop the tail first so that `first` is still a valid offset.
  str.erase(last);
  str.erase(0, first);
}

} // namespace data
} // namespace mlpack

#endif
+ */ + +#ifndef MLPACK_CORE_DATA_TYPES_HPP +#define MLPACK_CORE_DATA_TYPES_HPP + +#include +#include + +namespace mlpack { +namespace data { + +enum struct FileType +{ + FileTypeUnknown, + AutoDetect, //!< attempt to automatically detect the file type + RawASCII, //!< raw text (ASCII), without a header + ArmaASCII, //!< Armadillo text format, with a header specifying matrix type and size + CSVASCII, //!< comma separated values (CSV), without a header + RawBinary, //!< raw binary format (machine dependent), without a header + ArmaBinary, //!< Armadillo binary format (machine dependent), with a header specifying matrix type and size + PGMBinary, //!< Portable Grey Map (greyscale image) + PPMBinary, //!< Portable Pixel Map (colour image), used by the field and cube classes + HDF5Binary, //!< HDF5: open binary format, not specific to Armadillo, which can store arbitrary data + CoordASCII //!< simple co-ordinate format for sparse matrices (indices start at zero) +}; + +/** + * This function is used to convert mlpack file types to + * their respective Armadillo file types. + * + * @param type mlpack::FileType. + */ +inline arma::file_type ToArmaFileType(const FileType& type); + +} // namespace data +} // namespace mlpack + +#include "types_impl.hpp" + +#endif + diff --git a/src/mlpack/core/data/types_impl.hpp b/src/mlpack/core/data/types_impl.hpp new file mode 100644 index 00000000000..18fd0a4a634 --- /dev/null +++ b/src/mlpack/core/data/types_impl.hpp @@ -0,0 +1,83 @@ +/** + * @file core/data/types.hpp + * @author Gopi M. Tatiraju + * + * This file contains utilitiy functions related to types of data. + * We have adapted all the standard types which are available in armadillo. + * + * This file also contains functions to convery external file types to mlpack + * file types. In future if we the user of mlpack needs support of an external + * linear algebra library like armadillo, all functions related to handling the + * types goes here. 
+ * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MLPACK_CORE_DATA_TYPES_IMPL_HPP +#define MLPACK_CORE_DATA_TYPES_IMPL_HPP + +#include "types.hpp" + +namespace mlpack{ +namespace data{ + +inline arma::file_type ToArmaFileType(const FileType& type) +{ + switch(type) + { + case FileType::FileTypeUnknown: + return arma::file_type_unknown; + break; + + case FileType::AutoDetect: + return arma::auto_detect; + break; + + case FileType::RawASCII: + return arma::raw_ascii; + break; + + case FileType::ArmaASCII: + return arma::arma_ascii; + break; + + case FileType::CSVASCII: + return arma::csv_ascii; + break; + + case FileType::RawBinary: + return arma::raw_binary; + break; + + case FileType::ArmaBinary: + return arma::arma_binary; + break; + + case FileType::PGMBinary: + return arma::pgm_binary; + break; + + case FileType::PPMBinary: + return arma::ppm_binary; + break; + + case FileType::HDF5Binary: + return arma::hdf5_binary; + break; + + case FileType::CoordASCII: + return arma::coord_ascii; + break; + + default: + return arma::file_type_unknown; + break; + } +} + +} // namespace data +} // namespace mlpack +#endif diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 44e48e338e8..44f09fd8700 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -85,7 +85,7 @@ TEST_CASE("WrongExtensionCorrectLoad", "[LoadSaveTest]") // Now reload through our interface. 
REQUIRE( - data::Load("test_file.csv", test, false, true, arma::arma_binary) + data::Load("test_file.csv", test, false, true, FileType::ArmaBinary) == true); REQUIRE(test.n_rows == 4); @@ -269,7 +269,7 @@ TEST_CASE("LoadAnyExtensionFileTest", "[LoadSaveTest]") f.close(); arma::mat test; - REQUIRE(data::Load("test_file.blah", test, false, true, arma::raw_ascii)); + REQUIRE(data::Load("test_file.blah", test, false, true, FileType::RawASCII)); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2); @@ -979,10 +979,10 @@ TEST_CASE("SaveArmaBinaryArbitraryExtensionTest", "[LoadSaveTest]") "4 8;"; REQUIRE(data::Save("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + FileType::ArmaBinary) == true); REQUIRE(data::Load("test_file.blerp.blah", test, false, true, - arma::arma_binary) == true); + FileType::ArmaBinary) == true); REQUIRE(test.n_rows == 4); REQUIRE(test.n_cols == 2);