From 76b8f719887e55aab754600458f62e7fbb899651 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 12:08:37 +0800 Subject: [PATCH 01/42] first commit --- .../core/data/file_reader/csv_reader.hpp | 252 +++++++++++++ .../core/data/file_reader/line_reader.hpp | 259 ++++++++++++++ src/mlpack/core/data/file_reader/parser.hpp | 249 +++++++++++++ src/mlpack/core/data/file_reader/policy.hpp | 212 +++++++++++ .../data/file_reader/reader_exceptions.hpp | 337 ++++++++++++++++++ 5 files changed, 1309 insertions(+) create mode 100644 src/mlpack/core/data/file_reader/csv_reader.hpp create mode 100644 src/mlpack/core/data/file_reader/line_reader.hpp create mode 100644 src/mlpack/core/data/file_reader/parser.hpp create mode 100644 src/mlpack/core/data/file_reader/policy.hpp create mode 100644 src/mlpack/core/data/file_reader/reader_exceptions.hpp diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp new file mode 100644 index 00000000000..a8c1027e073 --- /dev/null +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -0,0 +1,252 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef CSV_READER_HPP +#define CSV_READER_HPP + +#include "reader_exceptions.hpp" +#include "line_reader.hpp" +#include "parser.hpp" +#include "policy.hpp" + +#include +#include + +#include + +namespace mlpack{ + +namespace io{ + +template, + class QuotePolicy = NoQuoteEscape<','>, + class OverflowPolicy = ThrowOnOverflow, + class CommentPolicy = NoComment + > +class CSVReader{ + public: + CSVReader() = delete; + CSVReader(const CSVReader&) = delete; + CSVReader&operator=(const CSVReader&); + + template + explicit CSVReader(size_t column_count, Args&&...args) : + in(std::forward(args)...), + columncount(column_count), + columnNames(column_count), + colOrder(column_count), + row(column_count, nullptr) + { + std::iota(std::begin(colOrder), std::end(colOrder), 0); + for(size_t i=1; i<=column_count; ++i){ + columnNames[i-1] = "col" + std::to_string(i); + } + } + + static void FileDimension(std::string const &fileName, + std::string const &separators, + size_t &rows, size_t &cols) + { + LineReader reader(fileName); + rows = 0; cols = 0; + char *line = reader.NextLine(); + if(line){ + using Tokenizer = boost::tokenizer>; + boost::escaped_list_separator sep("\\", separators, "\""); + std::string buffer(line, std::strlen(line)); + Tokenizer tok(buffer, sep);{ + for(Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) + ++cols; + } + + ++rows; + while(reader.NextLine()){ + ++rows; + } + } + } + + template + bool ReadRow(OutIter begin, OutIter end) + { + try{ + try{ + char *line = PruneComment(); + if(line){ + ParseLine(line, &row[0], colOrder); + }else{ + return false; + } + + ParseHelper(begin, end); + }catch(error::WithFileName &err){ + err.FileName(in.TruncatedFileName()); + throw; + } + }catch(error::WithFileLine&err){ + err.FileLine(in.FileLine()); + throw; + } + + return true; + } + + template + bool ReadRow(Container &colVals) + { + return ReadRow(std::begin(colVals), std::end(colVals)); + } + + template + bool ReadRow(ColType& ...cols) + { + try{ + try{ + char *line = PruneComment(); + if(line){ + ParseLine + (line, &row[0], colOrder); + ParseHelper(0, cols...); + }else{ + return false; + } + }catch(error::WithFileName &err){ + err.FileName(in.TruncatedFileName()); + throw; + } + }catch(error::WithFileLine&err){ + err.FileLine(in.FileLine()); + throw; + } + + return true; + } + + char* NextLine(){ + return in.NextLine(); + } + + void FileName(const std::string&file_name) + { + in.FileName(file_name); + } + + void FileName(const char*file_name) + { + in.FileName(file_name); + } + + const char* TruncatedFileName()const + { + return in.TruncatedFileName(); + } + + void FileLine(unsigned file_line) + { + in.FileLine(file_line); + } + + size_t FileLine()const + { + return in.FileLine(); + } + + private: + void ParseHelper(std::size_t){} + + template + void ParseHelper(std::size_t r, T&t, ColType&...cols) + { + if(row[r]){ + try{ + try{ + ::io::Parse(row[r], t); + }catch(error::WithColumnContent&err){ + err.ColumnContent(row[r]); + throw; + } + }catch(error::WithColumnName&err){ + err.ColumnName(columnNames[r].c_str()); + throw; + } + } + ParseHelper(r+1, cols...); + } + + template + void ParseHelper(OutIter begin, OutIter end) + { + std::size_t r = 0; + try{ + try{ + while(begin != end){ + if(row[r]){ + ::io::Parse(row[r++], *begin); + } + ++begin; + } + }catch(error::WithColumnContent&err){ + err.ColumnContent(row[r]); + throw; + } + }catch(error::WithColumnName&err){ + err.ColumnName(columnNames[r].c_str()); + throw; + } + } + + char* PruneComment() + { + char *line; + do{ + line = in.NextLine(); + if(!line){ + return nullptr; + } + }while(CommentPolicy::IsComment(line)); + + return line; + } + + LineReader in; + + size_t columncount; + std::vector columnNames; + std::vector colOrder; + std::vector row; +}; + +}//namespace io + +}//namespace mlpack + +#endif + diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp new file mode 100644 index 00000000000..6448d5367fc --- /dev/null +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -0,0 +1,259 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef LINE_READER_HPP +#define LINE_READER_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace mlpack{ + +namespace io{ + +namespace detail{ + +class OwningStdIOByteSourceBase +{ +public: + explicit OwningStdIOByteSourceBase(std::unique_ptr file): + file(std::move(file)){ + std::ios_base::sync_with_stdio(false); + } + + int Read(char *buffer, const size_t size){ + file->read(buffer, size); + return static_cast(file->gcount()); + } + + ~OwningStdIOByteSourceBase() + { + std::ios_base::sync_with_stdio(true); + } + +private: + std::unique_ptr file; +}; + +class SynchronousReader{ +public: + void Init(std::unique_ptr arg_byte_source){ + byteSource = std::move(arg_byte_source); + } + + bool IsValid()const{ + return byteSource != nullptr; + } + + void PrepareRead(char*arg_buffer, int arg_desired_byte_count){ + buffer = arg_buffer; + desiredByteCount = arg_desired_byte_count; + } + + int FinishRead(){ + return byteSource->Read(buffer, desiredByteCount); + } +private: + std::unique_ptr byteSource; + char *buffer; + int desiredByteCount; +}; + +} //namespace details + +class LineReader{ +private: + //blockLen equal to the limit of one line + static constexpr size_t blockLen = 1<<24; + + detail::SynchronousReader reader; + std::vector buffer; + int dataBegin; + int dataEnd; + int lineLength; + + std::string fileName; + size_t fileLine; + + static std::unique_ptr OpenFile(const char *file_name) + { + std::unique_ptr file(new std::ifstream(file_name, std::ios::binary)); + if(!file->is_open()){ + int x = errno; // store errno as soon as possible, doing it after constructor call can fail. + error::CanNotOpenFile err; + err.Errno(x); + err.FileName(file_name); + } + + return std::unique_ptr + (new detail::OwningStdIOByteSourceBase(std::move(file))); + } + + void Init(std::unique_ptr byteSource) + { + fileLine = 0; + lineLength = 0; + + //Allocate 48MBytes to store char of files + //First block store the string we want to handle + //Second block store the extra string to handle + //after the First block is consumed + //Third block is the "prepare block", use to read more + //data from the file + buffer.resize(3*blockLen); + dataBegin = 0; + dataEnd = byteSource->Read(&buffer[0], 2*blockLen); + + //Ignore UTF-8 BOM + if(dataEnd >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF'){ + dataBegin = 3; + } + + //If the data of file is >= 2*blockLen, we need to do + //the prepare of reading more data + if(dataEnd == 2*blockLen){ + reader.Init(std::move(byteSource)); + reader.PrepareRead(&buffer[0] + 2*blockLen, blockLen); + } + } + +public: + LineReader() = delete; + LineReader(const LineReader&) = delete; + LineReader&operator=(const LineReader&) = delete; + + explicit LineReader(const char *fileName){ + FileName(fileName); + Init(OpenFile(fileName)); + } + + explicit LineReader(const std::string &fileName){ + FileName(fileName.c_str()); + Init(OpenFile(fileName.c_str())); + } + + void FileName(const std::string &fileName){ + FileName(fileName.c_str()); + } + + void FileName(const char* fileName) + { + this->fileName = fileName; + } + + const char* TruncatedFileName()const + { + return fileName.c_str(); + } + + void FileLine(size_t fileLine) + { + this->fileLine = fileLine; + } + + size_t FileLine()const + { + return fileLine; + } + + int LineLength() const + { + return lineLength; + } + + char* NextLine() + { + if(dataBegin == dataEnd){ + return nullptr; + } + + ++fileLine; + + assert(dataBegin < dataEnd); + assert(dataEnd <= blockLen*2); + + if(dataBegin >= blockLen){ + //first block has been processed, copy second block to first block + std::memcpy(&buffer[0], &buffer[0]+blockLen, blockLen); + dataBegin -= blockLen; + dataEnd -= blockLen; + //if the file >= 2 blockLen, that means we need to read more data + if(reader.IsValid()) + { + dataEnd += reader.FinishRead(); + std::memcpy(&buffer[0]+blockLen, &buffer[0]+2*blockLen, blockLen); + reader.PrepareRead(&buffer[0] + 2*blockLen, blockLen); + } + } + + int lineEnd = dataBegin; + while(buffer[lineEnd] != '\n' && lineEnd != dataEnd){ + ++lineEnd; + } + + if(lineEnd - dataBegin + 1 > blockLen){ + error::LineLengthLimitExceeded err; + err.FileName(fileName.c_str()); + err.FileLine(fileLine); + throw err; + } + + if(buffer[lineEnd] == '\n'){ + buffer[lineEnd] = '\0'; + }else{ + // some files are missing the newline at the end of the + // last line + ++dataEnd; + buffer[lineEnd] = '\0'; + } + + // handle windows \r\n-line breaks + if(lineEnd != dataBegin && buffer[lineEnd-1] == '\r'){ + buffer[lineEnd-1] = '\0'; + } + + char *ret = &buffer[0] + dataBegin; + lineLength = lineEnd - dataBegin - 1; + dataBegin = lineEnd + 1; + return ret; + } +}; + +} //namespace io + +} //namespace mlpack + +#endif diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp new file mode 100644 index 00000000000..263bc1a167c --- /dev/null +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -0,0 +1,249 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +namespace mlpack{ + +namespace io{ + +template +void ChopNextColumn(char*& line, char*& colBegin, char*& colEnd) +{ + assert(line != nullptr); + + colBegin = line; + // the col_begin + (... - col_begin) removes the constness + colEnd = colBegin + (QuotePolicy::FindNextColumnEnd(colBegin) - colBegin); + + if(*colEnd == '\0'){ + line = nullptr; + }else{ + *colEnd = '\0'; + line = colEnd + 1; + } +} + +template +void ParseLine( + char *line, + char **sortedCol, + const std::vector &colOrder + ) +{ + for(std::size_t i=0; i(line, col_begin, col_end); + + if(colOrder[i] != -1){ + TrimPolicy::Trim(col_begin, col_end); + QuotePolicy::unescape(col_begin, col_end); + + sortedCol[colOrder[i]] = col_begin; + } + } + if(line != nullptr) + throw ::io::error::TooManyColumns(); +} + +template +void Parse(char *col, char &x){ + if(!*col) + throw error::InvalidSingleCharacter(); + x = *col; + ++col; + if(*col) + throw error::InvalidSingleCharacter(); +} + +template +void Parse(char *col, std::string &x){ + x = col; +} + +template +void Parse(char* col, const char*& x){ + x = col; +} + +template +void Parse(char*col, char*& x){ + x = col; +} + +template +void ParseUnsignedInteger(const char *col, T &x){ + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x > (std::numeric_limits::max()-y)/10){ + OverFlowPolicy::OnOverFlow(x); + return; + } + x = 10*x+y; + }else + throw error::NoDigit(); + ++col; + } +} + +templatevoid Parse(char *col, unsigned char &x) +{ParseUnsignedInteger(col, x);} +templatevoid Parse(char *col, unsigned short &x) +{ParseUnsignedInteger(col, x);} +templatevoid Parse(char *col, unsigned int &x) +{ParseUnsignedInteger(col, x);} +templatevoid Parse(char *col, unsigned long &x) +{ParseUnsignedInteger(col, x);} +templatevoid Parse(char *col, unsigned long long &x) +{ParseUnsignedInteger(col, x);} + +template +void ParseSignedInteger(const char *col, T &x){ + if(*col == '-'){ + ++col; + + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x < (std::numeric_limits::min()+y)/10){ + OverFlowPolicy::OnUnderFlow(x); + return; + } + x = 10*x-y; + }else + throw error::NoDigit(); + ++col; + } + return; + }else if(*col == '+') + ++col; + ParseUnsignedInteger(col, x); +} + +templatevoid Parse(char *col, signed char &x) +{ParseSignedInteger(col, x);} +templatevoid Parse(char *col, signed short &x) +{ParseSignedInteger(col, x);} +templatevoid Parse(char *col, signed int &x) +{ParseSignedInteger(col, x);} +templatevoid Parse(char *col, signed long &x) +{ParseSignedInteger(col, x);} +templatevoid Parse(char *col, signed long long &x) +{ParseSignedInteger(col, x);} + +template +void ParseFloat(const char *col, T &x){ + bool is_neg = false; + if(*col == '-'){ + is_neg = true; + ++col; + }else if(*col == '+'){ + ++col; + } + + x = 0; + while('0' <= *col && *col <= '9'){ + int const y = *col - '0'; + x *= 10; + x += static_cast(y); + ++col; + } + + if(*col == '.'|| *col == ','){ + ++col; + T pos = 1; + while('0' <= *col && *col <= '9'){ + pos /= 10; + int const y = *col - '0'; + ++col; + x += y*pos; + } + } + + if(*col == 'e' || *col == 'E'){ + ++col; + int e; + + ParseSignedInteger(col, e); + + if(e != 0){ + T base; + if(e < 0){ + base = 0.1; + e = -e; + }else{ + base = 10; + } + + while(e != 1){ + if((e & 1) == 0){ + base = base*base; + e >>= 1; + }else{ + x *= base; + --e; + } + } + x *= base; + } + }else{ + if(*col != '\0') + throw error::NoDigit(); + } + + if(is_neg) + x = -x; +} + +template void Parse(char *col, float &x) { ParseFloat(col, x); } +template void Parse(char *col, double &x) { ParseFloat(col, x); } +template void Parse(char *col, long double &x) { ParseFloat(col, x); } + +template +void Parse(char*, T&){ + // GCC evalutes "false" when reading the template and + // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why + // this strange construct is used. + static_assert(sizeof(T)!=sizeof(T), + "Can not parse this type. Only buildin integrals, floats, char, " + "char*, const char* and std::string are supported"); +} + +} //namespace io + +} //namespace mlpack diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp new file mode 100644 index 00000000000..00379fb6cbd --- /dev/null +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -0,0 +1,212 @@ +#ifndef POLICY_HPP +#define POLICY_HPP + +namespace mlpack{ + +namespace io{ + +//typedef unsigned ignore_column; +//static const ignore_column ignore_no_column = 0; +//static const ignore_column ignore_extra_column = 1; +//static const ignore_column ignore_missing_column = 2; + +#include + +template +struct TrimChars +{ +private: + constexpr static bool IsTrimChar(char) + { + return false; + } + + template + constexpr static bool IsTrimChar(char c, char trimChar, OtherTrimChars...otherTrimChars) + { + return c == trimChar || IsTrimChar(c, otherTrimChars...); + } + +public: + static void Trim(char*&strBegin, char*&strEnd) + { + while(strBegin != strEnd && IsTrimChar(*strBegin, TrimCharList...)) + { + ++strBegin; + } + while(strBegin != strEnd && IsTrimChar(*(strEnd-1), TrimCharList...)) + { + --strEnd; + } + *strEnd = '\0'; + } +}; + + +struct NoComment +{ + static bool IsComment(const char*) + { + return false; + } +}; + +template +struct SingleLineComment +{ +private: + constexpr static bool IsCommentStartChar(char) + { + return false; + } + + template + constexpr static bool IsCommentStartChar(char c, char commentstartchar, + OtherCommentStartChars...othercommentstartchars) + { + return c == commentstartchar || IsCommentStartChar(c, othercommentstartchars...); + } + +public: + static bool IsComment(const char* line) + { + return IsCommentStartChar(*line, CommentStartCharList...); + } +}; + +struct EmptyLineComment +{ + static bool IsComment(const char* line) + { + if(*line == '\0') + { + return true; + } + + while(*line == ' ' || *line == '\t') + { + ++line; + if(*line == 0){ + return true; + } + } + return false; + } +}; + +template +struct SingleAndEmptyLineComment +{ + static bool IsComment(const char *line) + { + return SingleLineComment::IsComment(line) || + EmptyLineComment::IsComment(line); + } +}; + +template +struct NoQuoteEscape +{ + static const char* FindNextColumnEnd(const char*col_begin) + { + while(*col_begin != sep && *col_begin != '\0') + { + ++col_begin; + } + return col_begin; + } + + static void unescape(char*&, char*&) + { + + } +}; + +template +struct DoubleQuoteEscape +{ + static const char* FindNextColumnEnd(const char*col_begin) + { + while(*col_begin != sep && *col_begin != '\0') + if(*col_begin != quote) + { + ++col_begin; + }else{ + do{ + ++col_begin; + while(*col_begin != quote){ + if(*col_begin == '\0') + throw error::EscapedStringNotClosed(); + ++col_begin; + } + ++col_begin; + }while(*col_begin == quote); + } + return col_begin; + } + + static void unescape(char *&col_begin, char *&col_end) + { + if(col_end - col_begin >= 2){ + if(*col_begin == quote && *(col_end-1) == quote){ + ++col_begin; + --col_end; + char *out = col_begin; + for(char*in = col_begin; in!=col_end; ++in){ + if(*in == quote && *(in+1) == quote){ + ++in; + } + *out = *in; + ++out; + } + col_end = out; + *col_end = '\0'; + } + } + } +}; + +struct ThrowOnOverflow +{ + template + static void OnOverflow(T&) + { + throw error::IntegerOverflow(); + } + + template + static void OnUnderflow(T&) + { + throw error::IntegerUnderflow(); + } +}; + +struct IgnoreOverflow +{ + template + static void OnOverFlow(T&){} + + template + static void OnUnderFlow(T&){} +}; + +struct SetToMaxOnOverflow +{ + template + static void OnOverFlow(T&x) + { + x = std::numeric_limits::max(); + } + + template + static void OnUnderFlow(T&x) + { + x = std::numeric_limits::min(); + } +}; + +} //namespace io + +} //namespace mlpack + +#endif diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp new file mode 100644 index 00000000000..75e7bced482 --- /dev/null +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -0,0 +1,337 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef READER_EXCEPTIONS_HPP +#define READER_EXCEPTIONS_HPP + +#include +#include +#include +#include +#include + +namespace mlpack{ + +namespace io{ + +namespace error{ + +struct Base : std::exception +{ + virtual void FormatErrorMessage()const = 0; + + const char* what()const throw() + { + FormatErrorMessage(); + return errorMessageBuffer; + } + + mutable char errorMessageBuffer[256]; +}; + +struct WithFileName +{ + WithFileName(){ + } + + void FileName(const char*file_name) + { + fileName = file_name; + } + + std::string fileName; +}; + +struct WithFileLine +{ + WithFileLine() : + fileLine(0) + { + } + + void FileLine(size_t fileLine) + { + this->fileLine = fileLine; + } + + size_t fileLine; +}; + +struct WithErrno +{ + WithErrno(){ + errnoValue = 0; + } + + void Errno(int errno_value) + { + this->errnoValue = errnoValue; + } + + int errnoValue; +}; + +struct CanNotOpenFile : + Base, + WithFileName, + WithErrno +{ + void FormatErrorMessage()const + { + if(errnoValue != 0){ + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Can not open file \"%s\" because \"%s\"." + ,&fileName[0], std::strerror(errnoValue)); + }else{ + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Can not open file \"%s\"." + ,&fileName[0]); + } + } +}; + +struct LineLengthLimitExceeded : + Base, + WithFileName, + WithFileLine +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Line length %d in file \"%s\" exceeds the maximum length of 2^24-1." + , fileLine, &fileName[0]); + } +}; + +struct WithColumnName +{ + WithColumnName() + { + std::fill(std::begin(columnName), std::end(columnName), 0); + } + + void ColumnName(const char* columnName) + { + std::strncpy(this->columnName, columnName, maxColumnNameLength); + this->columnName[maxColumnNameLength] = '\0'; + } + + static constexpr int maxColumnNameLength = 63; + char columnName[maxColumnNameLength+1]; +}; + +struct WithColumnContent +{ + WithColumnContent(){ + std::memset(columnContent, 0, maxColumnContentLength+1); + } + + void ColumnContent(const char *columnContent){ + std::strncpy(this->columnContent, columnContent, maxColumnContentLength); + this->columnContent[maxColumnContentLength] = '\0'; + } + + static constexpr int maxColumnContentLength = 63; + char columnContent[maxColumnContentLength+1]; +}; + + +struct ExtraColumnInHeader : + Base, + WithFileName, + WithColumnName +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Extra column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); + } +}; + +struct MissingColumnInHeader : + Base, + WithFileName, + WithColumnName{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Missing column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); + } +}; + +struct DuplicatedColumnInHeader : + Base, + WithFileName, + WithColumnName +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Duplicated column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); + } +}; + +struct HeaderMissing : + Base, + WithFileName +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Header missing in file \"%s\"." + , &fileName[0]); + } +}; + +struct TooFewColumns : + Base, + WithFileName, + WithFileLine +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Too few columns in line %d in file \"%s\"." + , fileLine, &fileName[0]); + } +}; + +struct TooManyColumns : + Base, + WithFileName, + WithFileLine +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Too many columns in line %d in file \"%s\"." + , fileLine, &fileName[0]); + } +}; + +struct EscapedStringNotClosed : + Base, + WithFileName, + WithFileLine +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Escaped string was not closed in line %d in file \"%s\"." + , fileLine, &fileName[0]); + } +}; + +struct IntegerMustBePositive : + Base, + WithFileName, + WithFileLine, + WithColumnName, + WithColumnContent +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); + } +}; + +struct NoDigit : + Base, + WithFileName, + WithFileLine, + WithColumnName, + WithColumnContent +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); + } +}; + +struct IntegerOverflow : + Base, + WithFileName, + WithFileLine, + WithColumnName, + WithColumnContent +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); + } +}; + +struct IntegerUnderflow : + Base, + WithFileName, + WithFileLine, + WithColumnName, + WithColumnContent +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); + } +}; + +struct InvalidSingleCharacter : + Base, + WithFileName, + WithFileLine, + WithColumnName, + WithColumnContent +{ + void FormatErrorMessage()const + { + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." + , columnContent, columnName, &fileName[0], fileLine); + } +}; + +}//namespace error + +} //namespace io + +} //namespace mlpack + +#endif From a479c4cf4f666781fe600e1f830953571205afbb Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 12:10:25 +0800 Subject: [PATCH 02/42] first commit --- src/mlpack/core/data/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt index f11f19cc8c6..0b4ddfb7f8f 100644 --- a/src/mlpack/core/data/CMakeLists.txt +++ b/src/mlpack/core/data/CMakeLists.txt @@ -16,6 +16,11 @@ set(SOURCES serialization_shim.hpp split_data.hpp binarize.hpp + file_reader/csv_reader.hpp + file_reader/line_reader.hpp + file_reader/parser.hpp + file_reader/policy.hpp + file_reader/reader_exceptions.hpp ) # add directory name to sources From f09c01ba3f47e08699dddc083eb6042cf16c121d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 12:10:47 +0800 Subject: [PATCH 03/42] use fast csv parser to load csv and tsv file --- src/mlpack/core/data/load_impl.hpp | 36 +++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index 5479bab17d5..e9ac8934436 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -12,6 +12,7 @@ #include "extension.hpp" #include +#include #include #include @@ -31,6 +32,27 @@ namespace data { namespace details{ +template +bool ParseTextFile(std::string const &filename, std::string const &separator, + arma::Mat &matrix) +{ + bool success = true; + try{ + size_t rows, cols; + io::CSVReader::FileDimension(filename, separator, rows, cols); + matrix.set_size(rows, cols); + io::CSVReader<> reader(cols, filename); + T *begin = &matrix[0]; + while(reader.ReadRow(begin, begin + cols)){ + begin += cols; + } + }catch(std::exception const&){ + success = false; + } + + return success; +} + template std::vector ToTokens(Tokenizer &lineTok) { @@ -332,10 +354,18 @@ bool Load(const std::string& filename, // We can't use the stream if the type is HDF5. bool success; - if (loadType != arma::hdf5_binary) - success = matrix.load(stream, loadType); - else + if(loadType != arma::hdf5_binary){ + if(loadType == arma::csv_ascii){ + success = details::ParseTextFile>(filename, ",", matrix); + }else if(loadType == arma::arma_ascii){ + success = details::ParseTextFile>(filename, "\t", matrix); + }else{ + success = matrix.load(stream, loadType); + } + } + else{ success = matrix.load(filename, loadType); + } if (!success) { From 068c4399abaa2b9269f134ed447490ac4a3bff22 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 13:04:43 +0800 Subject: [PATCH 04/42] 1 : fix bug, did not put the data into the matrix properly 2 : fix bug, do not use fast csv to load tsv or arma::arma_ascii file, since it may fail --- src/mlpack/core/data/load_impl.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp index e9ac8934436..a46ac0bf3dd 100644 --- a/src/mlpack/core/data/load_impl.hpp +++ b/src/mlpack/core/data/load_impl.hpp @@ -42,9 +42,13 @@ bool ParseTextFile(std::string const &filename, std::string const &separator, io::CSVReader::FileDimension(filename, separator, rows, cols); matrix.set_size(rows, cols); io::CSVReader<> reader(cols, filename); - T *begin = &matrix[0]; - while(reader.ReadRow(begin, begin + cols)){ - begin += cols; + size_t row = 0, col = 0; + std::vector vals(cols); + while(reader.ReadRow(vals)){ + for(auto const val : vals){ + matrix(row, col++) = val; + } + col = 0; ++row; } }catch(std::exception const&){ success = false; @@ -357,8 +361,6 @@ bool Load(const std::string& filename, if(loadType != arma::hdf5_binary){ if(loadType == arma::csv_ascii){ success = details::ParseTextFile>(filename, ",", matrix); - }else if(loadType == arma::arma_ascii){ - success = details::ParseTextFile>(filename, "\t", matrix); }else{ success = matrix.load(stream, loadType); } From 69f8a815a98ab58ca4dee29ae2e65ac4f5991051 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 17:09:00 +0800 Subject: [PATCH 05/42] use boost format and string operation to generate error messages --- .../data/file_reader/reader_exceptions.hpp | 109 +++++++----------- 1 file changed, 43 insertions(+), 66 deletions(-) diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index 75e7bced482..a7e7ce35a22 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -32,6 +32,8 @@ #ifndef READER_EXCEPTIONS_HPP #define READER_EXCEPTIONS_HPP +#include + #include #include #include @@ -51,10 +53,10 @@ struct Base : std::exception const char* what()const throw() { FormatErrorMessage(); - return errorMessageBuffer; + return errorMessageBuffer.c_str(); } - mutable char errorMessageBuffer[256]; + mutable std::string errorMessageBuffer; }; struct WithFileName @@ -85,36 +87,13 @@ struct WithFileLine size_t fileLine; }; -struct WithErrno -{ - WithErrno(){ - errnoValue = 0; - } - - void Errno(int errno_value) - { - this->errnoValue = errnoValue; - } - - int errnoValue; -}; - struct CanNotOpenFile : Base, - WithFileName, - WithErrno + WithFileName { void FormatErrorMessage()const { - if(errnoValue != 0){ - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Can not open file \"%s\" because \"%s\"." - ,&fileName[0], std::strerror(errnoValue)); - }else{ - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Can not open file \"%s\"." - ,&fileName[0]); - } + errorMessageBuffer = "Can not open file \"" + fileName + "\""; } }; @@ -125,9 +104,8 @@ struct LineLengthLimitExceeded : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Line length %d in file \"%s\" exceeds the maximum length of 2^24-1." - , fileLine, &fileName[0]); + errorMessageBuffer = "Line length " +std::to_string(fileLine) + + " in file \"" + fileName + "\" exceeds the maximum length of 2^24-1."; } }; @@ -171,9 +149,8 @@ struct ExtraColumnInHeader : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Extra column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer = std::string("Extra column \"") + columnName + + "\" in header of file \"" + fileName + "\""; } }; @@ -183,9 +160,8 @@ struct MissingColumnInHeader : WithColumnName{ void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Missing column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer = (boost::format("Missing column \"%1%\" in header of file \"%2%\".") + % columnName % fileName).str(); } }; @@ -196,9 +172,8 @@ struct DuplicatedColumnInHeader : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Duplicated column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer= (boost::format("Duplicated column \"%1%\" in header of file \"%2%\".") + % columnName % fileName).str(); } }; @@ -208,9 +183,8 @@ struct HeaderMissing : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Header missing in file \"%s\"." - , &fileName[0]); + errorMessageBuffer= (boost::format("Header missing in file \"%1%\".") + % fileName).str(); } }; @@ -221,9 +195,8 @@ struct TooFewColumns : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Too few columns in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer= (boost::format("Too few columns in line %1% in file \"%2%\".") + % fileLine % fileName).str(); } }; @@ -234,9 +207,8 @@ struct TooManyColumns : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Too many columns in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer= (boost::format("Too many columns in line %1% in file \"%2%\".") + % fileLine % fileName).str(); } }; @@ -247,9 +219,8 @@ struct EscapedStringNotClosed : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Escaped string was not closed in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer= (boost::format("Escaped string was not closed in line %1% in file \"%2%\".") + % fileLine % fileName).str(); } }; @@ -262,9 +233,10 @@ struct IntegerMustBePositive : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer= (boost::format("The integer \"%1%\" must be positive or 0 in " + "column \"%2%\" in file \"%3%\" in line \"%4%\".") + % columnContent % columnName % + fileName % fileLine).str(); } }; @@ -277,9 +249,10 @@ struct NoDigit : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer= (boost::format("The integer \"%1%\" contains an invalid digit in column " + "\"%2%\" in file \"%3%\" in line \"%4%\".") + % columnContent % columnName % + fileName % fileLine).str(); } }; @@ -292,9 +265,10 @@ struct IntegerOverflow : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer= (boost::format("The integer \"%1%\" overflows in column \"%2%\" " + "in file \"%3%\" in line \"%4%\".") + % columnContent % columnName % + fileName % fileLine).str(); } }; @@ -307,9 +281,10 @@ struct IntegerUnderflow : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer= (boost::format("The integer \"%1%\" underflows in column \"%2%\" " + "in file \"%3%\" in line \"%4%\".") + % columnContent % columnName % + fileName % fileLine).str(); } }; @@ -322,9 +297,11 @@ struct InvalidSingleCharacter : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer= (boost::format("The content \"%1%\" of column \"%2%\" " + "in file \"%3%\", in line \"%4%\" is not a " + "single character.") + % columnContent % columnName % + fileName % fileLine).str(); } }; From 1669a22ee5827b29423399771422631612f1ef77 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 17:25:59 +0800 Subject: [PATCH 06/42] remove useless codes --- src/mlpack/core/data/file_reader/line_reader.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index 6448d5367fc..25675236f0b 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -112,10 +112,9 @@ class LineReader{ { std::unique_ptr file(new std::ifstream(file_name, std::ios::binary)); if(!file->is_open()){ - int x = errno; // store errno as soon as possible, doing it after constructor call can fail. error::CanNotOpenFile err; - err.Errno(x); err.FileName(file_name); + throw err; } return std::unique_ptr From f848d8eb1ba3fc684158278045e15ae24157c901 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 17:26:28 +0800 Subject: [PATCH 07/42] fix bug--wrong namespace --- src/mlpack/core/data/file_reader/csv_reader.hpp | 4 ++-- src/mlpack/core/data/file_reader/parser.hpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index a8c1027e073..d72dbb46be5 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -188,7 +188,7 @@ class CSVReader{ if(row[r]){ try{ try{ - ::io::Parse(row[r], t); + Parse(row[r], t); }catch(error::WithColumnContent&err){ err.ColumnContent(row[r]); throw; @@ -209,7 +209,7 @@ class CSVReader{ try{ while(begin != end){ if(row[r]){ - ::io::Parse(row[r++], *begin); + Parse(row[r++], *begin); } ++begin; } diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index 263bc1a167c..7bd785b4fbb 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -62,7 +62,7 @@ void ParseLine( { for(std::size_t i=0; i(line, col_begin, col_end); @@ -75,7 +75,7 @@ void ParseLine( } } if(line != nullptr) - throw ::io::error::TooManyColumns(); + throw error::TooManyColumns(); } template @@ -114,8 +114,9 @@ void ParseUnsignedInteger(const char *col, T &x){ return; } x = 10*x+y; - }else + }else{ throw error::NoDigit(); + } ++col; } } @@ -150,8 +151,9 @@ void ParseSignedInteger(const char *col, T &x){ ++col; } return; - }else if(*col == '+') + }else if(*col == '+'){ ++col; + } ParseUnsignedInteger(col, x); } @@ -222,8 +224,9 @@ void ParseFloat(const char *col, T &x){ x *= base; } }else{ - if(*col != '\0') + if(*col != '\0'){ throw error::NoDigit(); + } } if(is_neg) From 9165f9e70f32820ead19b9ed5731bd6e5f9614a3 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 24 Jul 2016 17:51:24 +0800 Subject: [PATCH 08/42] change char array to std::string --- .../data/file_reader/reader_exceptions.hpp | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index a7e7ce35a22..6008d257c6e 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -111,34 +111,21 @@ struct LineLengthLimitExceeded : struct WithColumnName { - WithColumnName() - { - std::fill(std::begin(columnName), std::end(columnName), 0); - } - void ColumnName(const char* columnName) { - std::strncpy(this->columnName, columnName, maxColumnNameLength); - this->columnName[maxColumnNameLength] = '\0'; + this->columnName = columnName; } - static constexpr int maxColumnNameLength = 63; - char columnName[maxColumnNameLength+1]; + std::string columnName; }; struct WithColumnContent { - WithColumnContent(){ - std::memset(columnContent, 0, maxColumnContentLength+1); - } - void ColumnContent(const char *columnContent){ - std::strncpy(this->columnContent, columnContent, maxColumnContentLength); - this->columnContent[maxColumnContentLength] = '\0'; + this->columnContent = columnContent; } - static constexpr int maxColumnContentLength = 63; - char columnContent[maxColumnContentLength+1]; + std::string columnContent; }; From 8330f78af97cb314e54b7ed77ac3d4b0e50ecfdf Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 18:36:10 +0800 Subject: [PATCH 09/42] 1 : fix warning 2 : fix miss include file --- src/mlpack/core/data/file_reader/line_reader.hpp | 2 +- src/mlpack/core/data/file_reader/parser.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index 25675236f0b..9dc39bb6170 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -97,7 +97,7 @@ class SynchronousReader{ class LineReader{ private: //blockLen equal to the limit of one line - static constexpr size_t blockLen = 1<<24; + static constexpr int blockLen = 1<<24; detail::SynchronousReader reader; std::vector buffer; diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index 7bd785b4fbb..8b0c5be3c80 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -29,6 +29,8 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. +#include "policy.hpp" + #include #include From 623d70863be62428028d5665c75f5660d86c7e04 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 18:53:17 +0800 Subject: [PATCH 10/42] 1 : refine preprocess guard 2 : add missed preprocess guard --- src/mlpack/core/data/file_reader/csv_reader.hpp | 4 ++-- src/mlpack/core/data/file_reader/line_reader.hpp | 4 ++-- src/mlpack/core/data/file_reader/parser.hpp | 5 +++++ src/mlpack/core/data/file_reader/policy.hpp | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index d72dbb46be5..003b7cecddf 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -29,8 +29,8 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. -#ifndef CSV_READER_HPP -#define CSV_READER_HPP +#ifndef MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP +#define MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP #include "reader_exceptions.hpp" #include "line_reader.hpp" diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index 9dc39bb6170..fee6daea1db 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -29,8 +29,8 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. -#ifndef LINE_READER_HPP -#define LINE_READER_HPP +#ifndef MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP +#define MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP #include #include diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index 8b0c5be3c80..ae6881b5df0 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -29,6 +29,9 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. +#ifndef MLPACK_CORE_DATA_FILE_READER_PARSER_HPP +#define MLPACK_CORE_DATA_FILE_READER_PARSER_HPP + #include "policy.hpp" #include @@ -252,3 +255,5 @@ void Parse(char*, T&){ } //namespace io } //namespace mlpack + +#endif diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index 00379fb6cbd..b4517efcb91 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -1,5 +1,5 @@ -#ifndef POLICY_HPP -#define POLICY_HPP +#ifndef MLPACK_CORE_DATA_FILE_READER_POLICY_HPP +#define MLPACK_CORE_DATA_FILE_READER_POLICY_HPP namespace mlpack{ From e1432135ca8b68c0a7b3a3478f0739172ed8df71 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:26:31 +0800 Subject: [PATCH 11/42] remove duplicate header --- src/mlpack/core/data/file_reader/csv_reader.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index 003b7cecddf..b0cc52c543a 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -35,7 +35,6 @@ #include "reader_exceptions.hpp" #include "line_reader.hpp" #include "parser.hpp" -#include "policy.hpp" #include #include From 4bb002cbf01ef8ff5f8c148f3e5b4e73d16f94f9 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:32:35 +0800 Subject: [PATCH 12/42] remove useless codes --- src/mlpack/core/data/file_reader/policy.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index b4517efcb91..34b6bb2a2a0 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -5,11 +5,6 @@ namespace mlpack{ namespace io{ -//typedef unsigned ignore_column; -//static const ignore_column ignore_no_column = 0; -//static const ignore_column ignore_extra_column = 1; -//static const ignore_column ignore_missing_column = 2; - #include template From 035da37578ec7ee3fa701c9d7c3d6389f58fdf9d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:33:38 +0800 Subject: [PATCH 13/42] refine preprocess guard --- src/mlpack/core/data/file_reader/reader_exceptions.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index 6008d257c6e..9b6a4c1f820 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -29,8 +29,8 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. -#ifndef READER_EXCEPTIONS_HPP -#define READER_EXCEPTIONS_HPP +#ifndef MLPACK_CORE_DATA_FILE_READER_READER_EXCEPTIONS_HPP +#define MLPACK_CORE_DATA_FILE_READER_READER_EXCEPTIONS_HPP #include From 70f292e900a16ade8342fb1fcbda3b706dc58b71 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:36:16 +0800 Subject: [PATCH 14/42] add missed header --- src/mlpack/core/data/file_reader/policy.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index 34b6bb2a2a0..a5da6c8e106 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -1,6 +1,8 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_POLICY_HPP #define MLPACK_CORE_DATA_FILE_READER_POLICY_HPP +#include "reader_exceptions.hpp" + namespace mlpack{ namespace io{ From 871042d5184bc9bf4c08dddd739543e147da1027 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:36:43 +0800 Subject: [PATCH 15/42] remove duplicate header --- src/mlpack/core/data/file_reader/csv_reader.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index b0cc52c543a..c329fc990af 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -32,7 +32,6 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP #define MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP -#include "reader_exceptions.hpp" #include "line_reader.hpp" #include "parser.hpp" From 5a497e17b57eb22f87fbdc66c8d98c059979de58 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 19:36:58 +0800 Subject: [PATCH 16/42] add missed header --- src/mlpack/core/data/file_reader/line_reader.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index fee6daea1db..f4897f5460b 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -32,6 +32,8 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP #define MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP +#include "reader_exceptions.hpp" + #include #include #include From 9b609218afb8ef38f4bd8a156e033c50c17e4284 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 20:05:35 +0800 Subject: [PATCH 17/42] add license --- src/mlpack/core/data/file_reader/policy.hpp | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index a5da6c8e106..27563e499f2 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -1,3 +1,34 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + #ifndef MLPACK_CORE_DATA_FILE_READER_POLICY_HPP #define MLPACK_CORE_DATA_FILE_READER_POLICY_HPP From 3059b8391899668abf95f275fc5c9b6c02768779 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 20:11:39 +0800 Subject: [PATCH 18/42] use another way to include header --- src/mlpack/core/data/file_reader/csv_reader.hpp | 4 ++-- src/mlpack/core/data/file_reader/line_reader.hpp | 2 +- src/mlpack/core/data/file_reader/parser.hpp | 2 +- src/mlpack/core/data/file_reader/policy.hpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index c329fc990af..bf6a2558b2d 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -32,8 +32,8 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP #define MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP -#include "line_reader.hpp" -#include "parser.hpp" +#include +#include #include #include diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index f4897f5460b..4c74684c8cf 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -32,7 +32,7 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP #define MLPACK_CORE_DATA_FILE_READER_LINE_READER_HPP -#include "reader_exceptions.hpp" +#include #include #include diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index ae6881b5df0..d62bb6cd44b 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -32,7 +32,7 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_PARSER_HPP #define MLPACK_CORE_DATA_FILE_READER_PARSER_HPP -#include "policy.hpp" +#include #include #include diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index 27563e499f2..c21a71abf3d 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -32,7 +32,7 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_POLICY_HPP #define MLPACK_CORE_DATA_FILE_READER_POLICY_HPP -#include "reader_exceptions.hpp" +#include namespace mlpack{ From e915439d272f84acdc550bc3212b95badbec12ca Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 6 Aug 2016 20:25:03 +0800 Subject: [PATCH 19/42] use dot template to call the function since original solution can not understand by g++ --- src/mlpack/core/data/file_reader/parser.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index d62bb6cd44b..ae145ebe5d5 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -115,7 +115,7 @@ void ParseUnsignedInteger(const char *col, T &x){ if('0' <= *col && *col <= '9'){ T y = *col - '0'; if(x > (std::numeric_limits::max()-y)/10){ - OverFlowPolicy::OnOverFlow(x); + OverFlowPolicy().template OnOverFlow(x); return; } x = 10*x+y; @@ -147,7 +147,7 @@ void ParseSignedInteger(const char *col, T &x){ if('0' <= *col && *col <= '9'){ T y = *col - '0'; if(x < (std::numeric_limits::min()+y)/10){ - OverFlowPolicy::OnUnderFlow(x); + OverFlowPolicy().template OnUnderFlow(x); return; } x = 10*x-y; From 42944fecc14d5a5895fdb864bcf4162ef6e4ee3b Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 10:46:22 +0800 Subject: [PATCH 20/42] adjust format --- .../core/data/file_reader/line_reader.hpp | 119 +++++++++--------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/src/mlpack/core/data/file_reader/line_reader.hpp b/src/mlpack/core/data/file_reader/line_reader.hpp index 4c74684c8cf..67dac85371a 100644 --- a/src/mlpack/core/data/file_reader/line_reader.hpp +++ b/src/mlpack/core/data/file_reader/line_reader.hpp @@ -50,7 +50,7 @@ namespace detail{ class OwningStdIOByteSourceBase { -public: + public: explicit OwningStdIOByteSourceBase(std::unique_ptr file): file(std::move(file)){ std::ios_base::sync_with_stdio(false); @@ -66,12 +66,12 @@ class OwningStdIOByteSourceBase std::ios_base::sync_with_stdio(true); } -private: + private: std::unique_ptr file; }; class SynchronousReader{ -public: + public: void Init(std::unique_ptr arg_byte_source){ byteSource = std::move(arg_byte_source); } @@ -88,7 +88,8 @@ class SynchronousReader{ int FinishRead(){ return byteSource->Read(buffer, desiredByteCount); } -private: + + private: std::unique_ptr byteSource; char *buffer; int desiredByteCount; @@ -97,61 +98,7 @@ class SynchronousReader{ } //namespace details class LineReader{ -private: - //blockLen equal to the limit of one line - static constexpr int blockLen = 1<<24; - - detail::SynchronousReader reader; - std::vector buffer; - int dataBegin; - int dataEnd; - int lineLength; - - std::string fileName; - size_t fileLine; - - static std::unique_ptr OpenFile(const char *file_name) - { - std::unique_ptr file(new std::ifstream(file_name, std::ios::binary)); - if(!file->is_open()){ - error::CanNotOpenFile err; - err.FileName(file_name); - throw err; - } - - return std::unique_ptr - (new detail::OwningStdIOByteSourceBase(std::move(file))); - } - - void Init(std::unique_ptr byteSource) - { - fileLine = 0; - lineLength = 0; - - //Allocate 48MBytes to store char of files - //First block store the string we want to handle - //Second block store the extra string to handle - //after the First block is consumed - //Third block is the "prepare block", use to read more - //data from the file - buffer.resize(3*blockLen); - dataBegin = 0; - dataEnd = byteSource->Read(&buffer[0], 2*blockLen); - - //Ignore UTF-8 BOM - if(dataEnd >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF'){ - dataBegin = 3; - } - - //If the data of file is >= 2*blockLen, we need to do - //the prepare of reading more data - if(dataEnd == 2*blockLen){ - reader.Init(std::move(byteSource)); - reader.PrepareRead(&buffer[0] + 2*blockLen, blockLen); - } - } - -public: + public: LineReader() = delete; LineReader(const LineReader&) = delete; LineReader&operator=(const LineReader&) = delete; @@ -251,6 +198,60 @@ class LineReader{ dataBegin = lineEnd + 1; return ret; } + + private: + //blockLen equal to the limit of one line + static constexpr int blockLen = 1<<24; + + detail::SynchronousReader reader; + std::vector buffer; + int dataBegin; + int dataEnd; + int lineLength; + + std::string fileName; + size_t fileLine; + + static std::unique_ptr OpenFile(const char *file_name) + { + std::unique_ptr file(new std::ifstream(file_name, std::ios::binary)); + if(!file->is_open()){ + error::CanNotOpenFile err; + err.FileName(file_name); + throw err; + } + + return std::unique_ptr + (new detail::OwningStdIOByteSourceBase(std::move(file))); + } + + void Init(std::unique_ptr byteSource) + { + fileLine = 0; + lineLength = 0; + + //Allocate 48MBytes to store char of files + //First block store the string we want to handle + //Second block store the extra string to handle + //after the First block is consumed + //Third block is the "prepare block", use to read more + //data from the file + buffer.resize(3*blockLen); + dataBegin = 0; + dataEnd = byteSource->Read(&buffer[0], 2*blockLen); + + //Ignore UTF-8 BOM + if(dataEnd >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF'){ + dataBegin = 3; + } + + //If the data of file is >= 2*blockLen, we need to do + //the prepare of reading more data + if(dataEnd == 2*blockLen){ + reader.Init(std::move(byteSource)); + reader.PrepareRead(&buffer[0] + 2*blockLen, blockLen); + } + } }; } //namespace io From 802b21abd344985ac41fb0cdf74177af2fb2114b Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 10:47:43 +0800 Subject: [PATCH 21/42] refine preprocessor guard --- .../core/data/file_reader/csv_reader.hpp | 10 +- .../data/file_reader/reader_exceptions.hpp | 130 +++++++++++------- 2 files changed, 89 insertions(+), 51 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index bf6a2558b2d..6d1b2011863 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -32,8 +32,10 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP #define MLPACK_CORE_DATA_FILE_READER_CSV_READER_HPP -#include -#include +#include "reader_exceptions.hpp" +#include "line_reader.hpp" +#include "parser.hpp" +#include "policy.hpp" #include #include @@ -186,7 +188,7 @@ class CSVReader{ if(row[r]){ try{ try{ - Parse(row[r], t); + ::io::Parse(row[r], t); }catch(error::WithColumnContent&err){ err.ColumnContent(row[r]); throw; @@ -207,7 +209,7 @@ class CSVReader{ try{ while(begin != end){ if(row[r]){ - Parse(row[r++], *begin); + ::io::Parse(row[r++], *begin); } ++begin; } diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index 9b6a4c1f820..1d3e3ff3743 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -32,8 +32,6 @@ #ifndef MLPACK_CORE_DATA_FILE_READER_READER_EXCEPTIONS_HPP #define MLPACK_CORE_DATA_FILE_READER_READER_EXCEPTIONS_HPP -#include - #include #include #include @@ -53,10 +51,10 @@ struct Base : std::exception const char* what()const throw() { FormatErrorMessage(); - return errorMessageBuffer.c_str(); + return errorMessageBuffer; } - mutable std::string errorMessageBuffer; + mutable char errorMessageBuffer[256]; }; struct WithFileName @@ -87,13 +85,36 @@ struct WithFileLine size_t fileLine; }; +struct WithErrno +{ + WithErrno(){ + errnoValue = 0; + } + + void Errno(int errno_value) + { + this->errnoValue = errnoValue; + } + + int errnoValue; +}; + struct CanNotOpenFile : Base, - WithFileName + WithFileName, + WithErrno { void FormatErrorMessage()const { - errorMessageBuffer = "Can not open file \"" + fileName + "\""; + if(errnoValue != 0){ + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Can not open file \"%s\" because \"%s\"." + ,&fileName[0], std::strerror(errnoValue)); + }else{ + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Can not open file \"%s\"." + ,&fileName[0]); + } } }; @@ -104,28 +125,42 @@ struct LineLengthLimitExceeded : { void FormatErrorMessage()const { - errorMessageBuffer = "Line length " +std::to_string(fileLine) + - " in file \"" + fileName + "\" exceeds the maximum length of 2^24-1."; + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Line length %d in file \"%s\" exceeds the maximum length of 2^24-1." + , fileLine, &fileName[0]); } }; struct WithColumnName { + WithColumnName() + { + std::fill(std::begin(columnName), std::end(columnName), 0); + } + void ColumnName(const char* columnName) { - this->columnName = columnName; + std::strncpy(this->columnName, columnName, maxColumnNameLength); + this->columnName[maxColumnNameLength] = '\0'; } - std::string columnName; + static constexpr int maxColumnNameLength = 63; + char columnName[maxColumnNameLength+1]; }; struct WithColumnContent { + WithColumnContent(){ + std::memset(columnContent, 0, maxColumnContentLength+1); + } + void ColumnContent(const char *columnContent){ - this->columnContent = columnContent; + std::strncpy(this->columnContent, columnContent, maxColumnContentLength); + this->columnContent[maxColumnContentLength] = '\0'; } - std::string columnContent; + static constexpr int maxColumnContentLength = 63; + char columnContent[maxColumnContentLength+1]; }; @@ -136,8 +171,9 @@ struct ExtraColumnInHeader : { void FormatErrorMessage()const { - errorMessageBuffer = std::string("Extra column \"") + columnName + - "\" in header of file \"" + fileName + "\""; + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Extra column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); } }; @@ -147,8 +183,9 @@ struct MissingColumnInHeader : WithColumnName{ void FormatErrorMessage()const { - errorMessageBuffer = (boost::format("Missing column \"%1%\" in header of file \"%2%\".") - % columnName % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Missing column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); } }; @@ -159,8 +196,9 @@ struct DuplicatedColumnInHeader : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("Duplicated column \"%1%\" in header of file \"%2%\".") - % columnName % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Duplicated column \"%s\" in header of file \"%s\"." + , columnName, &fileName[0]); } }; @@ -170,8 +208,9 @@ struct HeaderMissing : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("Header missing in file \"%1%\".") - % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Header missing in file \"%s\"." + , &fileName[0]); } }; @@ -182,8 +221,9 @@ struct TooFewColumns : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("Too few columns in line %1% in file \"%2%\".") - % fileLine % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Too few columns in line %d in file \"%s\"." + , fileLine, &fileName[0]); } }; @@ -194,8 +234,9 @@ struct TooManyColumns : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("Too many columns in line %1% in file \"%2%\".") - % fileLine % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Too many columns in line %d in file \"%s\"." + , fileLine, &fileName[0]); } }; @@ -206,8 +247,9 @@ struct EscapedStringNotClosed : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("Escaped string was not closed in line %1% in file \"%2%\".") - % fileLine % fileName).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "Escaped string was not closed in line %d in file \"%s\"." + , fileLine, &fileName[0]); } }; @@ -220,10 +262,9 @@ struct IntegerMustBePositive : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("The integer \"%1%\" must be positive or 0 in " - "column \"%2%\" in file \"%3%\" in line \"%4%\".") - % columnContent % columnName % - fileName % fileLine).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); } }; @@ -236,10 +277,9 @@ struct NoDigit : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("The integer \"%1%\" contains an invalid digit in column " - "\"%2%\" in file \"%3%\" in line \"%4%\".") - % columnContent % columnName % - fileName % fileLine).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); } }; @@ -252,10 +292,9 @@ struct IntegerOverflow : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("The integer \"%1%\" overflows in column \"%2%\" " - "in file \"%3%\" in line \"%4%\".") - % columnContent % columnName % - fileName % fileLine).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); } }; @@ -268,10 +307,9 @@ struct IntegerUnderflow : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("The integer \"%1%\" underflows in column \"%2%\" " - "in file \"%3%\" in line \"%4%\".") - % columnContent % columnName % - fileName % fileLine).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." + , columnContent, columnName, &fileName[0], fileLine); } }; @@ -284,11 +322,9 @@ struct InvalidSingleCharacter : { void FormatErrorMessage()const { - errorMessageBuffer= (boost::format("The content \"%1%\" of column \"%2%\" " - "in file \"%3%\", in line \"%4%\" is not a " - "single character.") - % columnContent % columnName % - fileName % fileLine).str(); + std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), + "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." + , columnContent, columnName, &fileName[0], fileLine); } }; From 6d74a65c9c46f97a0e06bd8d69236c02c3cf17dd Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 16:13:05 +0800 Subject: [PATCH 22/42] adjust format --- src/mlpack/core/data/file_reader/policy.hpp | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index c21a71abf3d..c69a294ddd7 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -43,19 +43,7 @@ namespace io{ template struct TrimChars { -private: - constexpr static bool IsTrimChar(char) - { - return false; - } - - template - constexpr static bool IsTrimChar(char c, char trimChar, OtherTrimChars...otherTrimChars) - { - return c == trimChar || IsTrimChar(c, otherTrimChars...); - } - -public: + public: static void Trim(char*&strBegin, char*&strEnd) { while(strBegin != strEnd && IsTrimChar(*strBegin, TrimCharList...)) @@ -68,6 +56,18 @@ struct TrimChars } *strEnd = '\0'; } + + private: + constexpr static bool IsTrimChar(char) + { + return false; + } + + template + constexpr static bool IsTrimChar(char c, char trimChar, OtherTrimChars...otherTrimChars) + { + return c == trimChar || IsTrimChar(c, otherTrimChars...); + } }; @@ -81,8 +81,14 @@ struct NoComment template struct SingleLineComment -{ -private: +{ + public: + static bool IsComment(const char* line) + { + return IsCommentStartChar(*line, CommentStartCharList...); + } + + private: constexpr static bool IsCommentStartChar(char) { return false; @@ -94,12 +100,6 @@ struct SingleLineComment { return c == commentstartchar || IsCommentStartChar(c, othercommentstartchars...); } - -public: - static bool IsComment(const char* line) - { - return IsCommentStartChar(*line, CommentStartCharList...); - } }; struct EmptyLineComment From f67fa111bb03391337c5a249ac80263c79f03ad7 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 16:57:16 +0800 Subject: [PATCH 23/42] fix bug--function name errors --- src/mlpack/core/data/file_reader/parser.hpp | 6 +++--- src/mlpack/core/data/file_reader/policy.hpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index ae145ebe5d5..1cceb35e945 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -114,8 +114,8 @@ void ParseUnsignedInteger(const char *col, T &x){ while(*col != '\0'){ if('0' <= *col && *col <= '9'){ T y = *col - '0'; - if(x > (std::numeric_limits::max()-y)/10){ - OverFlowPolicy().template OnOverFlow(x); + if(x > (std::numeric_limits::max()-y)/10){ + OverFlowPolicy::OnOverFlow(x); return; } x = 10*x+y; @@ -147,7 +147,7 @@ void ParseSignedInteger(const char *col, T &x){ if('0' <= *col && *col <= '9'){ T y = *col - '0'; if(x < (std::numeric_limits::min()+y)/10){ - OverFlowPolicy().template OnUnderFlow(x); + OverFlowPolicy::OnOverFlow(x); return; } x = 10*x-y; diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index c69a294ddd7..e8f519241e8 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -197,13 +197,13 @@ struct DoubleQuoteEscape struct ThrowOnOverflow { template - static void OnOverflow(T&) + static void OnOverFlow(T&) { throw error::IntegerOverflow(); } template - static void OnUnderflow(T&) + static void OnUnderFlow(T&) { throw error::IntegerUnderflow(); } From da6ac2ea25bba5a0261cd0783e3e33ce4515d48d Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 17:00:42 +0800 Subject: [PATCH 24/42] refine class name --- src/mlpack/core/data/file_reader/csv_reader.hpp | 2 +- src/mlpack/core/data/file_reader/policy.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index 6d1b2011863..48c903fa2fc 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -48,7 +48,7 @@ namespace io{ template, class QuotePolicy = NoQuoteEscape<','>, - class OverflowPolicy = ThrowOnOverflow, + class OverflowPolicy = ThrowOnOverFlow, class CommentPolicy = NoComment > class CSVReader{ diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index e8f519241e8..414244949a3 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -194,7 +194,7 @@ struct DoubleQuoteEscape } }; -struct ThrowOnOverflow +struct ThrowOnOverFlow { template static void OnOverFlow(T&) From 08739c225bf4978e00cebc1aae2efa1e9587536a Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 17:22:23 +0800 Subject: [PATCH 25/42] fix type cast warning --- src/mlpack/core/data/file_reader/parser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index 1cceb35e945..9628d0ef3fd 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -211,7 +211,7 @@ void ParseFloat(const char *col, T &x){ if(e != 0){ T base; if(e < 0){ - base = 0.1; + base = static_cast(0.1); e = -e; }else{ base = 10; From 26f911a8548f15965a0d6852e796133f04b0086e Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 14 Aug 2016 17:29:06 +0800 Subject: [PATCH 26/42] add test for fast csv parser --- src/mlpack/tests/CMakeLists.txt | 1 + src/mlpack/tests/file_reader_parser_test.cpp | 129 +++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 src/mlpack/tests/file_reader_parser_test.cpp diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index b13494dff09..409d6770396 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -19,6 +19,7 @@ add_executable(mlpack_test emst_test.cpp fastmks_test.cpp feedforward_network_test.cpp + file_reader_parser_test.cpp gmm_test.cpp hmm_test.cpp hoeffding_tree_test.cpp diff --git a/src/mlpack/tests/file_reader_parser_test.cpp b/src/mlpack/tests/file_reader_parser_test.cpp new file mode 100644 index 00000000000..f173dfad83c --- /dev/null +++ b/src/mlpack/tests/file_reader_parser_test.cpp @@ -0,0 +1,129 @@ +/** + * @file file_reader_parser_test.cpp + * @author Ngap Wei Tham + * + * Test the parsers of fast csv + */ + +#include + +#include + +#include +#include "test_tools.hpp" + +#include +#include +#include +#include + +using namespace mlpack; +using namespace arma; + +BOOST_AUTO_TEST_SUITE(randomForest); + +template +void TestFloatParser() +{ + T num = 0; + io::Parse("10", num); + BOOST_REQUIRE_CLOSE(num, 10, 1e-5); + io::Parse("-10", num); + BOOST_REQUIRE_CLOSE(num, -10, 1e-5); + + io::Parse("10.01", num); + BOOST_REQUIRE_CLOSE(num, 10.01, 1e-5); + io::Parse("-10.01", num); + BOOST_REQUIRE_CLOSE(num, -10.01, 1e-5); + + io::Parse("10.05e3", num); + BOOST_REQUIRE_CLOSE(num, 10.05e3, 1e-5); + io::Parse("-10.05e3", num); + BOOST_REQUIRE_CLOSE(num, -10.05e3, 1e-5); + io::Parse("10.05e-3", num); + BOOST_REQUIRE_CLOSE(num, 10.05e-3, 1e-5); + io::Parse("-10.05e-3", num); + BOOST_REQUIRE_CLOSE(num, -10.05e-3, 1e-5); + + io::Parse("10.05E3", num); + BOOST_REQUIRE_CLOSE(num, 10.05e3, 1e-5); + io::Parse("-10.05E3", num); + BOOST_REQUIRE_CLOSE(num, -10.05e3, 1e-5); + io::Parse("10.05E-3", num); + BOOST_REQUIRE_CLOSE(num, 10.05e-3, 1E-5); + io::Parse("-10.05E-3", num); + BOOST_REQUIRE_CLOSE(num, -10.05e-3, 1e-5); +} + +BOOST_AUTO_TEST_CASE(ParseUnsignedIntTest) +{ + unsigned char num_char = 0; + io::Parse("200", num_char); + BOOST_REQUIRE_EQUAL(num_char, 200); + + unsigned short num_short = 0; + io::Parse("300", num_short); + BOOST_REQUIRE_EQUAL(num_short, 300); + + unsigned int num_int = 0; + io::Parse("400", num_int); + BOOST_REQUIRE_EQUAL(num_int, 400); + + unsigned long num_ulong = 0; + io::Parse("500", num_ulong); + BOOST_REQUIRE_EQUAL(num_ulong, 500); + + unsigned long long num_ullong = 0; + io::Parse("600", num_ullong); + BOOST_REQUIRE_EQUAL(num_ullong, 600); + + size_t num_size_t = 0; + io::Parse("700", num_size_t); + BOOST_REQUIRE_EQUAL(num_size_t, 700); +} + +BOOST_AUTO_TEST_CASE(ParseSignedIntTest) +{ + signed char num_char = 0; + io::Parse("10", num_char); + BOOST_REQUIRE_EQUAL(num_char, 10); + + short num_short = 0; + io::Parse("300", num_short); + BOOST_REQUIRE_EQUAL(num_short, 300); + + int num_int = 0; + io::Parse("400", num_int); + BOOST_REQUIRE_EQUAL(num_int, 400); + + long num_ulong = 0; + io::Parse("500", num_ulong); + BOOST_REQUIRE_EQUAL(num_ulong, 500); + + long long num_ullong = 0; + io::Parse("600", num_ullong); + BOOST_REQUIRE_EQUAL(num_ullong, 600); +} + +BOOST_AUTO_TEST_CASE(ParseFloatTest) +{ + TestFloatParser(); + TestFloatParser(); + TestFloatParser(); +} + +BOOST_AUTO_TEST_CASE(ParseStringTest) +{ + std::string str; + + io::Parse("600", str); + BOOST_REQUIRE_EQUAL("600", str); + + io::Parse("600 88976", str); + BOOST_REQUIRE_EQUAL("600 88976", str); + + io::Parse("hh 600 mm gg", str); + BOOST_REQUIRE_EQUAL("hh 600 mm gg", str); +} + +BOOST_AUTO_TEST_SUITE_END(); From 9487e0b358c9fbff7ce5da19247d2a971c689e80 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 26 Aug 2016 09:10:21 +0800 Subject: [PATCH 27/42] fix bug--wrong namespace --- src/mlpack/core/data/file_reader/csv_reader.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index 48c903fa2fc..5260113b3ea 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -188,7 +188,7 @@ class CSVReader{ if(row[r]){ try{ try{ - ::io::Parse(row[r], t); + Parse(row[r], t); }catch(error::WithColumnContent&err){ err.ColumnContent(row[r]); throw; @@ -209,7 +209,7 @@ class CSVReader{ try{ while(begin != end){ if(row[r]){ - ::io::Parse(row[r++], *begin); + Parse(row[r++], *begin); } ++begin; } From 1e13c4b771003520045f72d08dec6539bd890925 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 26 Aug 2016 11:25:29 +0800 Subject: [PATCH 28/42] fix warnings --- .../data/file_reader/reader_exceptions.hpp | 117 +++++++----------- 1 file changed, 46 insertions(+), 71 deletions(-) diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index 1d3e3ff3743..b6d2ddff0af 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -51,10 +51,10 @@ struct Base : std::exception const char* what()const throw() { FormatErrorMessage(); - return errorMessageBuffer; + return errorMessageBuffer.c_str(); } - mutable char errorMessageBuffer[256]; + mutable std::string errorMessageBuffer; }; struct WithFileName @@ -62,7 +62,7 @@ struct WithFileName WithFileName(){ } - void FileName(const char*file_name) + void FileName(const char* file_name) { fileName = file_name; } @@ -87,11 +87,11 @@ struct WithFileLine struct WithErrno { - WithErrno(){ - errnoValue = 0; + WithErrno() : errnoValue(0) + { } - void Errno(int errno_value) + void Errno(int errnoValue) { this->errnoValue = errnoValue; } @@ -106,14 +106,9 @@ struct CanNotOpenFile : { void FormatErrorMessage()const { + errorMessageBuffer = "Can not open file [" + fileName + "]"; if(errnoValue != 0){ - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Can not open file \"%s\" because \"%s\"." - ,&fileName[0], std::strerror(errnoValue)); - }else{ - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Can not open file \"%s\"." - ,&fileName[0]); + errorMessageBuffer += " because [" + std::string(std::strerror(errnoValue)) + "]."; } } }; @@ -125,42 +120,28 @@ struct LineLengthLimitExceeded : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Line length %d in file \"%s\" exceeds the maximum length of 2^24-1." - , fileLine, &fileName[0]); + errorMessageBuffer = "Line length " + std::to_string(fileLine) + + " in file [" + fileName + "] exceeds the maximum length of 2^24-1."; } }; struct WithColumnName -{ - WithColumnName() - { - std::fill(std::begin(columnName), std::end(columnName), 0); - } - +{ void ColumnName(const char* columnName) { - std::strncpy(this->columnName, columnName, maxColumnNameLength); - this->columnName[maxColumnNameLength] = '\0'; + this->columnName = columnName; } - static constexpr int maxColumnNameLength = 63; - char columnName[maxColumnNameLength+1]; + std::string columnName; }; struct WithColumnContent -{ - WithColumnContent(){ - std::memset(columnContent, 0, maxColumnContentLength+1); - } - +{ void ColumnContent(const char *columnContent){ - std::strncpy(this->columnContent, columnContent, maxColumnContentLength); - this->columnContent[maxColumnContentLength] = '\0'; + this->columnContent = columnContent; } - static constexpr int maxColumnContentLength = 63; - char columnContent[maxColumnContentLength+1]; + std::string columnContent; }; @@ -171,9 +152,8 @@ struct ExtraColumnInHeader : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Extra column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer = "Extra column [" + columnName + + "] in header of file [" + fileName + "]."; } }; @@ -183,9 +163,8 @@ struct MissingColumnInHeader : WithColumnName{ void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Missing column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer = "Missing column [" + columnName + + "] in header of file [" + fileName + "]."; } }; @@ -196,9 +175,8 @@ struct DuplicatedColumnInHeader : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Duplicated column \"%s\" in header of file \"%s\"." - , columnName, &fileName[0]); + errorMessageBuffer = "Duplicated column [" + columnName + + "] in header of file [" + fileName + "]."; } }; @@ -208,9 +186,7 @@ struct HeaderMissing : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Header missing in file \"%s\"." - , &fileName[0]); + errorMessageBuffer = "Header missing in file [" + fileName + "]."; } }; @@ -221,9 +197,8 @@ struct TooFewColumns : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Too few columns in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer = "Too few columns in line " + + std::to_string(fileLine) + "] in file [" + fileName + "]."; } }; @@ -234,9 +209,8 @@ struct TooManyColumns : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Too many columns in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer = "Too many columns in line " + + std::to_string(fileLine) + "] in file [" + fileName + "]."; } }; @@ -247,9 +221,8 @@ struct EscapedStringNotClosed : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "Escaped string was not closed in line %d in file \"%s\"." - , fileLine, &fileName[0]); + errorMessageBuffer = "Escaped string was not closed in line " + + std::to_string(fileLine) + "] in file [" + fileName + "]."; } }; @@ -262,9 +235,10 @@ struct IntegerMustBePositive : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer = "The integer [" + columnContent + "] must be positive " + "or 0 in column [" + columnName + "] " + + "in file [" + fileName + "] in line [" + + std::to_string(fileLine) + "]."; } }; @@ -277,9 +251,10 @@ struct NoDigit : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer = "The integer [" + columnContent + "] contains an invalid " + "digit in column [" + columnName + "] " + + "in file [" + fileName + "] in line [" + + std::to_string(fileLine) + "]."; } }; @@ -292,9 +267,9 @@ struct IntegerOverflow : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer = "The integer [" + columnContent + "] overflows in column " + "[" + columnName + "] " + "in file [" + fileName + "] " + "in line [" + std::to_string(fileLine) + "]."; } }; @@ -307,9 +282,9 @@ struct IntegerUnderflow : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer = "The integer [" + columnContent + "] underflows in column " + "[" + columnName + "] " + "in file [" + fileName + "] " + "in line [" + std::to_string(fileLine) + "]."; } }; @@ -322,9 +297,9 @@ struct InvalidSingleCharacter : { void FormatErrorMessage()const { - std::snprintf(errorMessageBuffer, sizeof(errorMessageBuffer), - "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." - , columnContent, columnName, &fileName[0], fileLine); + errorMessageBuffer = "The content [" + columnContent + "] of column " + "[" + columnName + "] " + "in file [" + fileName + "] " + "in line [" + std::to_string(fileLine) + "]."; } }; From 14e828368d0074df434519bd34b4c39bcf7c57cd Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 27 Aug 2016 06:05:23 +0800 Subject: [PATCH 29/42] include missed header --- src/mlpack/core/data/file_reader/reader_exceptions.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mlpack/core/data/file_reader/reader_exceptions.hpp b/src/mlpack/core/data/file_reader/reader_exceptions.hpp index b6d2ddff0af..5e5c03ae6b4 100644 --- a/src/mlpack/core/data/file_reader/reader_exceptions.hpp +++ b/src/mlpack/core/data/file_reader/reader_exceptions.hpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include From 5687076751e53acbe81aed100cb9530d1fbe3eaa Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 27 Aug 2016 06:45:31 +0800 Subject: [PATCH 30/42] const correctness --- src/mlpack/core/data/file_reader/parser.hpp | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/mlpack/core/data/file_reader/parser.hpp b/src/mlpack/core/data/file_reader/parser.hpp index 9628d0ef3fd..db5de3b271e 100644 --- a/src/mlpack/core/data/file_reader/parser.hpp +++ b/src/mlpack/core/data/file_reader/parser.hpp @@ -69,7 +69,7 @@ void ParseLine( if(line == nullptr){ throw error::TooFewColumns(); } - char*col_begin, *col_end; + char *col_begin, *col_end; ChopNextColumn(line, col_begin, col_end); if(colOrder[i] != -1){ @@ -84,7 +84,7 @@ void ParseLine( } template -void Parse(char *col, char &x){ +void Parse(const char *col, char &x){ if(!*col) throw error::InvalidSingleCharacter(); x = *col; @@ -94,12 +94,12 @@ void Parse(char *col, char &x){ } template -void Parse(char *col, std::string &x){ +void Parse(const char *col, std::string &x){ x = col; } template -void Parse(char* col, const char*& x){ +void Parse(const char* col, const char*& x){ x = col; } @@ -126,15 +126,15 @@ void ParseUnsignedInteger(const char *col, T &x){ } } -templatevoid Parse(char *col, unsigned char &x) +templatevoid Parse(const char *col, unsigned char &x) {ParseUnsignedInteger(col, x);} -templatevoid Parse(char *col, unsigned short &x) +templatevoid Parse(const char *col, unsigned short &x) {ParseUnsignedInteger(col, x);} -templatevoid Parse(char *col, unsigned int &x) +templatevoid Parse(const char *col, unsigned int &x) {ParseUnsignedInteger(col, x);} -templatevoid Parse(char *col, unsigned long &x) +templatevoid Parse(const char *col, unsigned long &x) {ParseUnsignedInteger(col, x);} -templatevoid Parse(char *col, unsigned long long &x) +templatevoid Parse(const char *col, unsigned long long &x) {ParseUnsignedInteger(col, x);} template @@ -162,15 +162,15 @@ void ParseSignedInteger(const char *col, T &x){ ParseUnsignedInteger(col, x); } -templatevoid Parse(char *col, signed char &x) +templatevoid Parse(const char *col, signed char &x) {ParseSignedInteger(col, x);} -templatevoid Parse(char *col, signed short &x) +templatevoid Parse(const char *col, signed short &x) {ParseSignedInteger(col, x);} -templatevoid Parse(char *col, signed int &x) +templatevoid Parse(const char *col, signed int &x) {ParseSignedInteger(col, x);} -templatevoid Parse(char *col, signed long &x) +templatevoid Parse(const char *col, signed long &x) {ParseSignedInteger(col, x);} -templatevoid Parse(char *col, signed long long &x) +templatevoid Parse(const char *col, signed long long &x) {ParseSignedInteger(col, x);} template @@ -238,12 +238,12 @@ void ParseFloat(const char *col, T &x){ x = -x; } -template void Parse(char *col, float &x) { ParseFloat(col, x); } -template void Parse(char *col, double &x) { ParseFloat(col, x); } -template void Parse(char *col, long double &x) { ParseFloat(col, x); } +template void Parse(const char *col, float &x) { ParseFloat(col, x); } +template void Parse(const char *col, double &x) { ParseFloat(col, x); } +template void Parse(const char *col, long double &x) { ParseFloat(col, x); } template -void Parse(char*, T&){ +void Parse(const char*, T&){ // GCC evalutes "false" when reading the template and // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why // this strange construct is used. From cc87e0875ad2550b13b65da6752ea3d192998e4c Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sun, 4 Sep 2016 02:45:55 +0800 Subject: [PATCH 31/42] add test cases for csv_reader --- src/mlpack/tests/CMakeLists.txt | 1 + .../tests/file_reader_csv_reader_test.cpp | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 src/mlpack/tests/file_reader_csv_reader_test.cpp diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt index 409d6770396..cbbecd1c7db 100644 --- a/src/mlpack/tests/CMakeLists.txt +++ b/src/mlpack/tests/CMakeLists.txt @@ -19,6 +19,7 @@ add_executable(mlpack_test emst_test.cpp fastmks_test.cpp feedforward_network_test.cpp + file_reader_csv_reader_test.cpp file_reader_parser_test.cpp gmm_test.cpp hmm_test.cpp diff --git a/src/mlpack/tests/file_reader_csv_reader_test.cpp b/src/mlpack/tests/file_reader_csv_reader_test.cpp new file mode 100644 index 00000000000..45e4464c696 --- /dev/null +++ b/src/mlpack/tests/file_reader_csv_reader_test.cpp @@ -0,0 +1,72 @@ +/** + * @file file_reader_parser_test.cpp + * @author Ngap Wei Tham + * + * Test the parsers of fast csv + */ + +#include + +#include + +#include +#include "test_tools.hpp" + +#include +#include +#include +#include + +using namespace mlpack; +using namespace arma; + +BOOST_AUTO_TEST_SUITE(CSVReaderTest); + +BOOST_AUTO_TEST_CASE(ReadHomogeneousRowTest) +{ + std::fstream f; + f.open("test.csv", std::fstream::out); + f << "1, 2, hello \n"; + f << "hello , goodbye, coffe \n"; + f.close(); + + io::CSVReader<> reader(3, "test.csv"); + std::vector elems(3); + + reader.ReadRow(elems); + BOOST_REQUIRE_EQUAL(elems.size(), 3); + BOOST_REQUIRE_EQUAL(elems[0], "1"); + BOOST_REQUIRE_EQUAL(elems[1], "2"); + BOOST_REQUIRE_EQUAL(elems[2], "hello"); + + reader.ReadRow(elems); + BOOST_REQUIRE_EQUAL(elems.size(), 3); + BOOST_REQUIRE_EQUAL(elems[0], "hello"); + BOOST_REQUIRE_EQUAL(elems[1], "goodbye"); + BOOST_REQUIRE_EQUAL(elems[2], "coffe"); +} + +BOOST_AUTO_TEST_CASE(ReadHeterogeneousRowTest) +{ + std::fstream f; + f.open("test.csv", std::fstream::out); + f << "1, 2.0, hello \n"; + f << "3 , 4.0, coffe \n"; + f.close(); + + io::CSVReader<> reader(3, "test.csv"); + int integer = 0; + double dValue = 0; + std::string str; + reader.ReadRow(integer, dValue, str); + BOOST_REQUIRE_EQUAL(integer, 1); + BOOST_REQUIRE_CLOSE(dValue, 2.0, 1e-5); + BOOST_REQUIRE_EQUAL(str, "hello"); + + reader.ReadRow(integer, dValue, str); + BOOST_REQUIRE_EQUAL(integer, 3); + BOOST_REQUIRE_CLOSE(dValue, 4.0, 1e-5); + BOOST_REQUIRE_EQUAL(str, "coffe"); +} + +BOOST_AUTO_TEST_SUITE_END(); From b05d0541bcb4ca4716a0935c8d983b8de195b2dd Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Tue, 6 Sep 2016 09:21:07 +0800 Subject: [PATCH 32/42] remove useless data member and adjust style --- src/mlpack/core/data/file_reader/csv_reader.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/mlpack/core/data/file_reader/csv_reader.hpp b/src/mlpack/core/data/file_reader/csv_reader.hpp index 5260113b3ea..21790135f62 100644 --- a/src/mlpack/core/data/file_reader/csv_reader.hpp +++ b/src/mlpack/core/data/file_reader/csv_reader.hpp @@ -58,15 +58,14 @@ class CSVReader{ CSVReader&operator=(const CSVReader&); template - explicit CSVReader(size_t column_count, Args&&...args) : - in(std::forward(args)...), - columncount(column_count), - columnNames(column_count), - colOrder(column_count), - row(column_count, nullptr) + explicit CSVReader(size_t columnCount, Args&&...args) : + in(std::forward(args)...), + columnNames(columnCount), + colOrder(columnCount), + row(columnCount, nullptr) { std::iota(std::begin(colOrder), std::end(colOrder), 0); - for(size_t i=1; i<=column_count; ++i){ + for(size_t i=1; i <= columnCount; ++i){ columnNames[i-1] = "col" + std::to_string(i); } } @@ -238,7 +237,6 @@ class CSVReader{ LineReader in; - size_t columncount; std::vector columnNames; std::vector colOrder; std::vector row; From 356a79731c5390bd5bc9f9151802832e823bbbc7 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 00:27:53 +0800 Subject: [PATCH 33/42] add new test ParseLineTest --- src/mlpack/tests/file_reader_parser_test.cpp | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/mlpack/tests/file_reader_parser_test.cpp b/src/mlpack/tests/file_reader_parser_test.cpp index f173dfad83c..ffd63770688 100644 --- a/src/mlpack/tests/file_reader_parser_test.cpp +++ b/src/mlpack/tests/file_reader_parser_test.cpp @@ -126,4 +126,25 @@ BOOST_AUTO_TEST_CASE(ParseStringTest) BOOST_REQUIRE_EQUAL("hh 600 mm gg", str); } +BOOST_AUTO_TEST_CASE(ParseLineTest) +{ + std::vector chars(3, nullptr); + std::vector colOrder(chars.size()); + std::iota(std::begin(colOrder), std::end(colOrder), 0); + + char strs[] = "600 ,800 ,300 "; + io::ParseLine, io::NoQuoteEscape<','>>(strs, + &chars[0], colOrder); + BOOST_REQUIRE_EQUAL("600", chars[0]); + BOOST_REQUIRE_EQUAL("800", chars[1]); + BOOST_REQUIRE_EQUAL("300", chars[2]); + + char strs2[] = "600\t800\t 300 "; + io::ParseLine, io::NoQuoteEscape<'\t'>>(strs2, + &chars[0], colOrder); + BOOST_REQUIRE_EQUAL("600", chars[0]); + BOOST_REQUIRE_EQUAL("800", chars[1]); + BOOST_REQUIRE_EQUAL("300", chars[2]); +} + BOOST_AUTO_TEST_SUITE_END(); From 64259a53e6895e829b2ebe72bb2352527fb93b0c Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 12:06:24 +0800 Subject: [PATCH 34/42] 1 : add new policy NoQuoteEscapes, able to separate text with variable separators 2 : test NoQuoteEscapes --- src/mlpack/core/data/file_reader/policy.hpp | 32 +++++++++++++++++++- src/mlpack/tests/file_reader_parser_test.cpp | 7 +++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/file_reader/policy.hpp b/src/mlpack/core/data/file_reader/policy.hpp index 414244949a3..018c58c966d 100644 --- a/src/mlpack/core/data/file_reader/policy.hpp +++ b/src/mlpack/core/data/file_reader/policy.hpp @@ -135,7 +135,7 @@ struct SingleAndEmptyLineComment template struct NoQuoteEscape { - static const char* FindNextColumnEnd(const char*col_begin) + static const char* FindNextColumnEnd(const char *col_begin) { while(*col_begin != sep && *col_begin != '\0') { @@ -147,7 +147,37 @@ struct NoQuoteEscape static void unescape(char*&, char*&) { + } +}; + +template +struct NoQuoteEscapes +{ + static const char* FindNextColumnEnd(const char *col_begin) + { + while(!IsEscapeChar(*col_begin, sep...) && *col_begin != '\0') + { + ++col_begin; + } + return col_begin; + } + + static void unescape(char*&, char*&) + { + } + +private: + constexpr static bool IsEscapeChar(char) + { + return false; + } + + template + constexpr static bool IsEscapeChar(char c, char escapeChar, OtherEscapeChars...otherEscapeChars) + { + return c == escapeChar || IsEscapeChar(c, otherEscapeChars...); + } }; template diff --git a/src/mlpack/tests/file_reader_parser_test.cpp b/src/mlpack/tests/file_reader_parser_test.cpp index ffd63770688..9051006f575 100644 --- a/src/mlpack/tests/file_reader_parser_test.cpp +++ b/src/mlpack/tests/file_reader_parser_test.cpp @@ -145,6 +145,13 @@ BOOST_AUTO_TEST_CASE(ParseLineTest) BOOST_REQUIRE_EQUAL("600", chars[0]); BOOST_REQUIRE_EQUAL("800", chars[1]); BOOST_REQUIRE_EQUAL("300", chars[2]); + + char strs3[] = "600\t800,300"; + io::ParseLine, io::NoQuoteEscapes<'\t',','>>(strs3, + &chars[0], colOrder); + BOOST_REQUIRE_EQUAL("600", chars[0]); + BOOST_REQUIRE_EQUAL("800", chars[1]); + BOOST_REQUIRE_EQUAL("300", chars[2]); } BOOST_AUTO_TEST_SUITE_END(); From 5a6b0c6552a94e6f5ec4020b361c31082470535f Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 12:33:34 +0800 Subject: [PATCH 35/42] implement LoadARFF without DatasetInfo --- src/mlpack/core/data/load_arff_impl.hpp | 82 +++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 68c9184fe71..04c2223233c 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -9,12 +9,94 @@ // In case it hasn't been included yet. #include "load_arff.hpp" +#include "file_reader/csv_reader.hpp" #include namespace mlpack { namespace data { +namespace details{ + +/** + * Store the information of arff file + */ +struct ArffInfo +{ + ArffInfo() : + totalRows(0), + totalCols(0) + {} + + size_t totalRows; + size_t totalCols; + std::vector classCols;//record the column belongs to class type +}; + +ArffInfo LoadARFFInfo(const std::string &filename) +{ + using namespace boost::algorithm; + + io::LineReader reader(filename); + ArffInfo info; + std::vector chars(3, nullptr); + std::vector colOrder(chars.size()); + std::iota(std::begin(colOrder), std::end(colOrder), 0); + while(char *line = reader.NextLine()){ + if(line[0] == '@'){ + if(istarts_with(line, "@attribute")){ + io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(line, + &chars[0], colOrder); + if(istarts_with(chars[2], "numeric")){ + info.classCols.emplace_back(info.totalRows); + } + ++info.totalRows; + }else if(ifind_first(line, "@data")){ + break; + } + } + } + + while(reader.NextLine()){ + ++info.totalCols; + } + + return info; +} + +} //namespace details + +template +void LoadARFF(const std::string& filename, arma::Mat& matrix) +{ + using namespace boost::algorithm; + + io::LineReader reader(filename); + while(char *line = reader.NextLine()){ + if(istarts_with(line, "@data")){ + break; + } + } + + const auto info = details::LoadARFFInfo(filename); + std::vector chars(info.totalRows, nullptr); + std::vector colOrder(chars.size()); + std::iota(std::begin(colOrder), std::end(colOrder), 0); + matrix.set_size(info.totalRows, info.totalCols); + size_t row = 0, col = 0; + + while(char *line = reader.NextLine()){ + io::ParseLine, io::NoQuoteEscape<','>>(line, + &chars[0], colOrder); + for(const auto val : chars){ + if(val){ + io::Parse(val, matrix(row++, col)); + } + } + ++col; row = 0; + } +} + template void LoadARFF(const std::string& filename, arma::Mat& matrix, From ac6f46c49322f8799960873fc70c954aa89b3155 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 12:34:25 +0800 Subject: [PATCH 36/42] add NoMappingARFFTest --- src/mlpack/tests/load_save_test.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp index 4eb8f12598e..1a0c00fca9f 100644 --- a/src/mlpack/tests/load_save_test.cpp +++ b/src/mlpack/tests/load_save_test.cpp @@ -1403,6 +1403,34 @@ BOOST_AUTO_TEST_CASE(HarderKeonTest) BOOST_REQUIRE_EQUAL(ntInfo.NumMappings(3), 3); } +BOOST_AUTO_TEST_CASE(NoMappingARFFTest) +{ + std::fstream f; + f.open("test.arff", std::fstream::out); + f << "@relation test" << endl; + f << endl; + f << "@attribute one NUMERIC" << endl; + f << "@attribute two NUMERIC" << endl; + f << endl; + f << "@data" << endl; + f << "1, 2" << endl; + f << "3, 4" << endl; + f << "5, 6" << endl; + f << "7, 8" << endl; + f.close(); + + arma::mat dataset; + data::LoadARFF("test.arff", dataset); + + BOOST_REQUIRE_EQUAL(dataset.n_rows, 2); + BOOST_REQUIRE_EQUAL(dataset.n_cols, 4); + + for (size_t i = 0; i < dataset.n_elem; ++i) + BOOST_REQUIRE_CLOSE(dataset[i], double(i + 1), 1e-5); + + remove("test.arff"); +} + /** * A simple ARFF load test. Two attributes, both numeric. */ From 2e3892476b1dffec2c2eca2ff50f991dbc627442 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 17:13:45 +0800 Subject: [PATCH 37/42] use inline to avoid symbol confliction --- src/mlpack/core/data/load_arff_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index fa70f5d8ba4..2a9a3e07b74 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -33,6 +33,7 @@ struct ArffInfo std::vector classCols;//record the column belongs to class type }; +inline ArffInfo LoadARFFInfo(const std::string &filename) { using namespace boost::algorithm; From 3fafb8f72c3c27424f300433a886b8942a61bf86 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 23:20:19 +0800 Subject: [PATCH 38/42] replace multiple spaces to one space when parsing arff --- src/mlpack/core/data/load_arff_impl.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 2a9a3e07b74..9d0d36f6e33 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -13,6 +13,8 @@ #include +#include + namespace mlpack { namespace data { @@ -45,8 +47,9 @@ ArffInfo LoadARFFInfo(const std::string &filename) std::iota(std::begin(colOrder), std::end(colOrder), 0); while(char *line = reader.NextLine()){ if(line[0] == '@'){ - if(istarts_with(line, "@attribute")){ - io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(line, + if(istarts_with(line, "@attribute")){ + auto newStr = std::regex_replace(line, std::regex("[' ']{2,}"), " "); + io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(&newStr[0], &chars[0], colOrder); if(istarts_with(chars[2], "numeric")){ info.classCols.emplace_back(info.totalRows); From 732de6b20f574ff4a15d7614acf70bbf94f57fb5 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Fri, 9 Sep 2016 23:21:43 +0800 Subject: [PATCH 39/42] avoid recreation of regex object --- src/mlpack/core/data/load_arff_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 9d0d36f6e33..1a080555e1f 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -45,10 +45,11 @@ ArffInfo LoadARFFInfo(const std::string &filename) std::vector chars(3, nullptr); std::vector colOrder(chars.size()); std::iota(std::begin(colOrder), std::end(colOrder), 0); + const auto regex = std::regex("[' ']{2,}"); while(char *line = reader.NextLine()){ if(line[0] == '@'){ if(istarts_with(line, "@attribute")){ - auto newStr = std::regex_replace(line, std::regex("[' ']{2,}"), " "); + auto newStr = std::regex_replace(line, regex, " "); io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(&newStr[0], &chars[0], colOrder); if(istarts_with(chars[2], "numeric")){ From 37477ff6cd2aa649049b67c1fc890c21b18874a2 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 10 Sep 2016 00:35:30 +0800 Subject: [PATCH 40/42] boost xpressive to replace std regex since there are some bugs when using std regex --- src/mlpack/core/data/load_arff_impl.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 1a080555e1f..f59953de3dc 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -12,8 +12,7 @@ #include "file_reader/csv_reader.hpp" #include - -#include +#include namespace mlpack { namespace data { @@ -45,11 +44,11 @@ ArffInfo LoadARFFInfo(const std::string &filename) std::vector chars(3, nullptr); std::vector colOrder(chars.size()); std::iota(std::begin(colOrder), std::end(colOrder), 0); - const auto regex = std::regex("[' ']{2,}"); + const auto regex = boost::xpressive::cregex::compile("[' ']{2,}"); while(char *line = reader.NextLine()){ if(line[0] == '@'){ if(istarts_with(line, "@attribute")){ - auto newStr = std::regex_replace(line, regex, " "); + auto newStr = boost::xpressive::regex_replace(line, regex, " "); io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(&newStr[0], &chars[0], colOrder); if(istarts_with(chars[2], "numeric")){ From 41a716689db449215c44ee4de6a942dc634d15d0 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 10 Sep 2016 01:39:25 +0800 Subject: [PATCH 41/42] switch back to std regex since xpressive need long compile time --- src/mlpack/core/data/load_arff_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index f59953de3dc..4874efe1bc9 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -12,7 +12,8 @@ #include "file_reader/csv_reader.hpp" #include -#include + +#include namespace mlpack { namespace data { @@ -44,11 +45,11 @@ ArffInfo LoadARFFInfo(const std::string &filename) std::vector chars(3, nullptr); std::vector colOrder(chars.size()); std::iota(std::begin(colOrder), std::end(colOrder), 0); - const auto regex = boost::xpressive::cregex::compile("[' ']{2,}"); + const auto regex = std::regex("[' ']{2,}"); while(char *line = reader.NextLine()){ if(line[0] == '@'){ if(istarts_with(line, "@attribute")){ - auto newStr = boost::xpressive::regex_replace(line, regex, " "); + auto newStr = std::regex_replace(std::string(line), regex, " "); io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(&newStr[0], &chars[0], colOrder); if(istarts_with(chars[2], "numeric")){ From 7252e2daa9eb5dbd57e28736b30823809e2d81f4 Mon Sep 17 00:00:00 2001 From: stereomatchingkiss Date: Sat, 10 Sep 2016 03:26:19 +0800 Subject: [PATCH 42/42] fix type mismatch --- src/mlpack/core/data/load_arff_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp index 4874efe1bc9..f79b3a19855 100644 --- a/src/mlpack/core/data/load_arff_impl.hpp +++ b/src/mlpack/core/data/load_arff_impl.hpp @@ -49,7 +49,7 @@ ArffInfo LoadARFFInfo(const std::string &filename) while(char *line = reader.NextLine()){ if(line[0] == '@'){ if(istarts_with(line, "@attribute")){ - auto newStr = std::regex_replace(std::string(line), regex, " "); + auto newStr = std::regex_replace(std::string(line), regex, std::string(" ")); io::ParseLine, io::NoQuoteEscapes<' ','\t'>>(&newStr[0], &chars[0], colOrder); if(istarts_with(chars[2], "numeric")){