-
Notifications
You must be signed in to change notification settings - Fork 537
/
unigram_tokenizer.h
64 lines (49 loc) · 1.59 KB
/
unigram_tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Copyright (C) 2012 Massachusetts Institute of Technology, Lincoln Laboratory
// License: Boost Software License See LICENSE.txt for the full license.
// Authors: Davis E. King (davis@dlib.net)
#ifndef MIT_LL_XTECH_UNIGRAM_ToKENIZER_H_
#define MIT_LL_XTECH_UNIGRAM_ToKENIZER_H_
#include <string>
#include <iostream>
#include <fstream>
#include <mitie/conll_tokenizer.h>
namespace mitie
{
// ----------------------------------------------------------------------------------------
class unigram_tokenizer
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            A thin wrapper around conll_tokenizer used for reading a sequence of
            unigrams from an input stream.  Each token is obtained from the wrapped
            conll_tokenizer and then post-processed so that every ASCII digit
            character ('0' through '9') in it is overwritten with a '#' character.
            No other characters are touched.
    !*/

public:

    typedef std::string token_type;

    // Builds a tokenizer whose wrapped conll_tokenizer is default constructed.
    unigram_tokenizer () {}

    // Builds a tokenizer whose wrapped conll_tokenizer is constructed from `in`.
    unigram_tokenizer (std::istream& in) : tok(in) {}

    /*!
        Asks the wrapped conll_tokenizer to fill in `token`, masks any digits in
        it, and then returns exactly what the wrapped tokenizer returned.
        (Presumably that value is true while tokens remain and false afterwards --
        confirm against conll_tokenizer.)  The digit masking is applied to `token`
        regardless of the returned value.
    !*/
    bool operator() (std::string& token)
    {
        const bool got_token = tok(token);
        mask_digits(token);
        return got_token;
    }

private:

    // Overwrites, in place, every character of `text` that lies in the range
    // '0'..'9' with '#'.  The length of `text` is never changed.
    static inline void mask_digits (std::string& text)
    {
        for (std::string::iterator ch = text.begin(); ch != text.end(); ++ch)
        {
            const bool is_digit = !(*ch < '0' || '9' < *ch);
            if (is_digit)
                *ch = '#';
        }
    }

    // The tokenizer that does the actual work of splitting the input.
    conll_tokenizer tok;
};
// ----------------------------------------------------------------------------------------
}
#endif // MIT_LL_XTECH_UNIGRAM_ToKENIZER_H_