forked from npshub/mantid
-
Notifications
You must be signed in to change notification settings - Fork 0
/
StringTokenizer.cpp
149 lines (132 loc) · 5.92 KB
/
StringTokenizer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
// NScD Oak Ridge National Laboratory, European Spallation Source,
// Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
// SPDX - License - Identifier: GPL - 3.0 +
#include "MantidKernel/StringTokenizer.h"
#include <algorithm>
#include <cctype> // std::isspace
#include <iterator> //cbegin,cend
#include <stdexcept>
namespace {
// Implement our own trim functions to avoid the locale overhead in boost::trim.

// Whitespace test that is safe for every char value. std::isspace has
// undefined behaviour when handed a negative value other than EOF, which is
// what a plain (signed) char holding a byte >= 0x80 converts to, so the
// character is converted to unsigned char before the call.
bool isWhitespaceChar(const char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; }

// trim from start: erases, in place, the leading run of whitespace characters.
void trimTokenFromStart(std::string &s) { s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), isWhitespaceChar)); }

// trim from end: erases, in place, the trailing run of whitespace characters.
void trimTokenFromEnd(std::string &s) {
  // Searching backwards finds the last non-whitespace character; base() of that
  // reverse iterator is the position just after it, i.e. where the trailing
  // whitespace starts (or begin() if the string is all whitespace).
  s.erase(std::find_if_not(s.rbegin(), s.rend(), isWhitespaceChar).base(), s.end());
}

// trim from both ends
void trimToken(std::string &s) {
  trimTokenFromStart(s);
  trimTokenFromEnd(s);
}
// If the final character of str is one of the separators in delims, append an
// empty string to tokens to represent the empty token that follows it.
// An empty str has no final character, so tokens is left untouched in that
// case (calling back() on an empty string would be undefined behaviour).
void addEmptyFinalToken(const std::string &str, const std::string &delims, std::vector<std::string> &tokens) {
  if (str.empty())
    return;
  if (delims.find(str.back()) != std::string::npos)
    tokens.emplace_back();
}
// Generic tokenizer built on std::find_first_of, modelled after
// http://tcbrindle.github.io/posts/a-quicker-study-on-tokenising/
// MIT licensed.
//
// Walks [first, last) and calls binary_op(tokenBegin, tokenEnd) once for each
// stretch of elements delimited by a separator, where a separator is any
// element that also occurs in [s_first, s_last). Adjacent separators produce
// an empty range. Nothing is reported after a separator that is the very last
// element, and binary_op is never called for an empty input range.
template <class InputIt, class ForwardIt, class BinOp>
void for_each_token(InputIt first, InputIt last, ForwardIt s_first, ForwardIt s_last, BinOp binary_op) {
  for (InputIt tokenBegin = first; tokenBegin != last;) {
    const InputIt tokenEnd = std::find_first_of(tokenBegin, last, s_first, s_last);
    binary_op(tokenBegin, tokenEnd);
    // No separator was found, so the token just reported was the last one.
    if (tokenEnd == last)
      return;
    // Resume just past the separator that terminated this token.
    tokenBegin = std::next(tokenEnd);
  }
}
// Splits str at every character that occurs in delims and stores the pieces
// in output, replacing whatever output held before. Pieces are stored
// verbatim: whitespace is kept and the empty pieces between adjacent
// separators are kept as empty strings.
void splitKeepingWhitespaceEmptyTokens(const std::string &str, const std::string &delims,
                                       std::vector<std::string> &output) {
  using TokenIt = std::string::const_iterator;
  const auto storeToken = [&output](TokenIt tokenBegin, TokenIt tokenEnd) {
    output.emplace_back(tokenBegin, tokenEnd);
  };

  output.clear();
  for_each_token(str.cbegin(), str.cend(), delims.cbegin(), delims.cend(), storeToken);
}
// Splits str at every character that occurs in delims and stores the
// non-empty pieces in output, replacing whatever output held before.
// Whitespace is kept, so a piece consisting only of whitespace still counts
// as non-empty and is stored.
void splitKeepingWhitespaceIgnoringEmptyTokens(const std::string &str, const std::string &delims,
                                               std::vector<std::string> &output) {
  using TokenIt = std::string::const_iterator;
  const auto storeIfNotEmpty = [&output](TokenIt tokenBegin, TokenIt tokenEnd) {
    // Zero-length range: two separators were adjacent (or str started with one).
    if (tokenBegin == tokenEnd)
      return;
    output.emplace_back(tokenBegin, tokenEnd);
  };

  output.clear();
  for_each_token(str.cbegin(), str.cend(), delims.cbegin(), delims.cend(), storeIfNotEmpty);
}
// Splits str at every character that occurs in delims, trims whitespace from
// both ends of each piece and stores the result in output, replacing whatever
// output held before. Pieces that are empty, or become empty once trimmed,
// are kept as empty strings.
void splitIgnoringWhitespaceKeepingEmptyTokens(const std::string &str, const std::string &delims,
                                               std::vector<std::string> &output) {
  using TokenIt = std::string::const_iterator;
  const auto storeTrimmed = [&output](TokenIt tokenBegin, TokenIt tokenEnd) {
    output.emplace_back(tokenBegin, tokenEnd);
    // Trim the stored string in place rather than building a temporary.
    auto &token = output.back();
    trimToken(token);
  };

  output.clear();
  for_each_token(str.cbegin(), str.cend(), delims.cbegin(), delims.cend(), storeTrimmed);
}
// Splits str at every character that occurs in delims, trims whitespace from
// both ends of each piece and stores only the pieces that are still non-empty
// after trimming. output is cleared first, so it ends up holding just the
// tokens of str.
void splitIgnoringWhitespaceEmptyTokens(const std::string &str, const std::string &delims,
                                        std::vector<std::string> &output) {
  using TokenIt = std::string::const_iterator;
  const auto storeTrimmedIfNotEmpty = [&output](TokenIt tokenBegin, TokenIt tokenEnd) {
    // Zero-length range: nothing to trim or store.
    if (tokenBegin == tokenEnd)
      return;
    output.emplace_back(tokenBegin, tokenEnd);
    auto &token = output.back();
    trimToken(token);
    // The piece was whitespace only; drop the entry that was just added.
    if (token.empty())
      output.pop_back();
  };

  output.clear();
  for_each_token(str.cbegin(), str.cend(), delims.cbegin(), delims.cend(), storeTrimmedIfNotEmpty);
}
} // namespace
/**
 * Constructor requiring a string to tokenize and a string of separators. The
 * resulting tokens are stored in m_tokens.
 * @param str Input string to be separated into tokens. An empty string
 * produces no tokens; options is not validated in that case.
 * @param separators List of characters used to separate the input string.
 * @param options tokenizer settings: 0, or a bitwise-or of values from the
 * StringTokenizer::Options enum.
 * @throw Throws std::runtime_error if str is not empty and options is not one
 * of the combinations handled below (options > 7 according to the original
 * documentation; NOTE(review): confirm against the enum values in the header).
 */
Mantid::Kernel::StringTokenizer::StringTokenizer(const std::string &str, const std::string &separators,
                                                 unsigned options) {
  // An empty input has no tokens, so there is nothing to do.
  if (str.empty())
    return;

  // TOK_TRIM selects the splitters that strip whitespace from each token and
  // TOK_IGNORE_EMPTY the ones that drop empty tokens.
  // Unless TOK_IGNORE_FINAL_EMPTY_TOKEN or TOK_IGNORE_EMPTY is set, a separator
  // in the last position of str contributes one extra, empty, final token.
  switch (options) {
  case 0:
    splitKeepingWhitespaceEmptyTokens(str, separators, m_tokens);
    addEmptyFinalToken(str, separators, m_tokens);
    break;
  case TOK_IGNORE_FINAL_EMPTY_TOKEN:
    splitKeepingWhitespaceEmptyTokens(str, separators, m_tokens);
    break;
  case TOK_TRIM:
    splitIgnoringWhitespaceKeepingEmptyTokens(str, separators, m_tokens);
    addEmptyFinalToken(str, separators, m_tokens);
    break;
  case (TOK_IGNORE_FINAL_EMPTY_TOKEN | TOK_TRIM):
    splitIgnoringWhitespaceKeepingEmptyTokens(str, separators, m_tokens);
    break;
  // When empty tokens are dropped anyway, the final-empty-token flag makes no
  // difference, so each pair of combinations shares one branch.
  case TOK_IGNORE_EMPTY:
  case (TOK_IGNORE_FINAL_EMPTY_TOKEN | TOK_IGNORE_EMPTY):
    splitKeepingWhitespaceIgnoringEmptyTokens(str, separators, m_tokens);
    break;
  case (TOK_TRIM | TOK_IGNORE_EMPTY):
  case (TOK_IGNORE_FINAL_EMPTY_TOKEN | TOK_TRIM | TOK_IGNORE_EMPTY):
    splitIgnoringWhitespaceEmptyTokens(str, separators, m_tokens);
    break;
  default:
    // Any value that is not one of the combinations above is rejected.
    throw std::runtime_error("Invalid option passed to Mantid::Kernel::StringTokenizer:" + std::to_string(options));
  }
}