This repository has been archived by the owner on Mar 9, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
SentenceTokeniser.java
187 lines (161 loc) · 6.41 KB
/
SentenceTokeniser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
package cc.util;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.util.Function;
/**
 * Splits plain text into sentences.
 *
 * <p>Sentence boundaries are detected with a configurable tokenizer (PTB by default)
 * and a set of sentence-final punctuation tokens ({@code .}, {@code ?}, {@code !}),
 * optionally followed by closing brackets/quotes per PTB conventions.
 *
 * <p>Based heavily on {@code edu.stanford.nlp.process.DocumentPreprocessor}.
 *
 * <p>NOTE(review): not thread-safe — {@link #extractSentences(String)} mutates the
 * shared {@code inputReader} field, so concurrent calls would interfere.
 */
public class SentenceTokeniser implements Iterable<List<HasWord>> {

    /** Source of the text currently being tokenised; set by {@link #extractSentences(String)}. */
    private Reader inputReader = null;

    // Configurable options
    private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
    private String[] sentenceFinalPuncWords = {".", "?", "!"};
    private Function<List<HasWord>, List<HasWord>> escaper = null;
    private String sentenceDelimiter = null;

    // From PTB conventions: tokens that may appear AFTER sentence-final punctuation
    // yet still belong to the same sentence (closing brackets and quotes).
    private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"};

    /** Small demonstration of sentence extraction on two sample inputs. */
    public static void main(String[] args) throws IOException {
        SentenceTokeniser sentenceTokeniser = new SentenceTokeniser();
        for (String sentence : sentenceTokeniser.extractSentences( "This is a sentence. And (do you know) So is this!")) {
            System.out.println("next sentence ["+sentence+"]");
        }
        for (String sentence : sentenceTokeniser.extractSentences( "Oh, and by the way.... guess what! so is this!")) {
            System.out.println("next sentence ["+sentence+"]");
        }
    }

    /**
     * Splits {@code text} into sentences, rendering each sentence as a single
     * string whose tokens are separated by single spaces.
     *
     * @param text plain text containing zero or more sentences
     * @return the sentences in order of appearance; possibly empty, never {@code null}
     */
    public List<String> extractSentences(String text) {
        this.inputReader = new StringReader(text);
        List<String> sentences = new ArrayList<String>();
        for (List<HasWord> sentence : this) {
            StringBuilder sentenceBuffer = new StringBuilder();
            for (HasWord word : sentence) {
                // Separator-first join: avoids the trailing space entirely, and —
                // unlike the previous substring(0, length-1) — does not throw
                // StringIndexOutOfBoundsException if a sentence comes back empty
                // (possible when a configured escaper returns an empty list).
                if (sentenceBuffer.length() > 0) {
                    sentenceBuffer.append(' ');
                }
                sentenceBuffer.append(word.word());
            }
            sentences.add(sentenceBuffer.toString());
        }
        return sentences;
    }

    /**
     * Returns sentences until the document is exhausted. Calls close() if the end of the document
     * is reached. Otherwise, the user is required to close the stream.
     */
    public Iterator<List<HasWord>> iterator() {
        return new PlainTextIterator();
    }

    /**
     * Streams sentences out of {@code inputReader} one at a time, buffering a single
     * look-ahead sentence in {@code nextSent} and carrying over the first token of
     * the following sentence in {@code nextSentCarryover}.
     */
    private class PlainTextIterator implements Iterator<List<HasWord>> {

        private Tokenizer<? extends HasWord> tokenizer;
        // Tokens that END a sentence (e.g. ".", "?", "!").
        private Set<String> sentDelims;
        // Tokens that may FOLLOW a sentence delimiter and still belong to it.
        private Set<String> delimFollowers = new HashSet<String>(Arrays.asList(sentenceFinalFollowers));
        // NOTE(review): never assigned in this class (always null) — inherited from
        // DocumentPreprocessor, where it splits "word_tag" tokens; confirm before removing.
        private Function<String, String[]> splitTag;
        // The buffered next sentence; null means "not yet primed" or "exhausted".
        private List<HasWord> nextSent = null;
        // First token of the sentence AFTER nextSent, consumed while detecting its boundary.
        private List<HasWord> nextSentCarryover = new ArrayList<HasWord>();

        public PlainTextIterator() {
            // Establish how to find sentence boundaries: either the configured
            // punctuation set, or a single explicit delimiter string.
            sentDelims = new HashSet<String>();
            boolean eolIsSignificant = false;
            if (sentenceDelimiter == null) {
                if (sentenceFinalPuncWords != null) {
                    sentDelims = new HashSet<String>(Arrays.asList(sentenceFinalPuncWords));
                }
            } else {
                sentDelims.add(sentenceDelimiter);
                // An explicit delimiter is authoritative; followers no longer apply.
                delimFollowers = new HashSet<String>();
                // A whitespace delimiter (e.g. "\n") requires newline tokens to survive tokenisation.
                eolIsSignificant = sentenceDelimiter.matches("\\s+");
            }

            // Set up the tokenizer, falling back to whitespace splitting when no factory is configured.
            if (tokenizerFactory == null) {
                tokenizer = WhitespaceTokenizer.
                    newWordWhitespaceTokenizer(inputReader, eolIsSignificant);
            } else {
                if (eolIsSignificant)
                    tokenizerFactory.setOptions("tokenizeNLs");//wsg2010: This key currently used across all tokenizers
                tokenizer = tokenizerFactory.getTokenizer(inputReader);
            }
        }

        /**
         * Reads tokens until a complete sentence is buffered in {@code nextSent},
         * or sets {@code nextSent} to null (and closes the reader) at end of input.
         */
        private void primeNext() {
            // Start from any token already consumed past the previous boundary.
            nextSent = new ArrayList<HasWord>(nextSentCarryover);
            nextSentCarryover.clear();
            boolean seenBoundary = false;

            while (tokenizer.hasNext()) {
                HasWord token = tokenizer.next();

                if (splitTag != null) {
                    String[] toks = splitTag.apply(token.word());
                    token.setWord(toks[0]);
                    if (toks.length == 2 && token instanceof HasTag) {
                        //wsg2011: Some of the underlying tokenizers return old
                        //JavaNLP labels. We could convert to CoreLabel here, but
                        //we choose a conservative implementation....
                        ((HasTag) token).setTag(toks[1]);
                    }
                }

                if (sentDelims.contains(token.word())) {
                    seenBoundary = true;
                } else if (seenBoundary && !delimFollowers.contains(token.word())) {
                    // First token of the NEXT sentence: stash it and stop.
                    nextSentCarryover.add(token);
                    break;
                }

                // Skip pure-whitespace tokens (e.g. significant newlines).
                // NOTE(review): an additional PTBLexer.NEWLINE_TOKEN check was
                // commented out upstream; intentionally not reinstated here.
                if (!token.word().matches("\\s+")) {
                    nextSent.add(token);
                }

                // If there are no words that can follow a sentence delimiter,
                // then there are two cases. In one case is we already have a
                // sentence, in which case there is no reason to look at the
                // next token, since that just causes buffering without any
                // chance of the current sentence being extended, since
                // delimFollowers = {}. In the other case, we have an empty
                // sentence, which at this point means the sentence delimiter
                // was a whitespace token such as \n. We might as well keep
                // going as if we had never seen anything.
                if (seenBoundary && delimFollowers.isEmpty()) {
                    if (!nextSent.isEmpty()) {
                        break;
                    } else {
                        seenBoundary = false;
                    }
                }
            }

            if (nextSent.isEmpty() && nextSentCarryover.isEmpty()) {
                // End of input: release the reader and signal exhaustion.
                IOUtils.closeIgnoringExceptions(inputReader);
                inputReader = null;
                nextSent = null;
            } else if (escaper != null) {
                nextSent = escaper.apply(nextSent);
            }
        }

        public boolean hasNext() {
            if (nextSent == null) {
                primeNext();
            }
            return nextSent != null;
        }

        public List<HasWord> next() {
            if (nextSent == null) {
                primeNext();
            }
            if (nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisIteration = nextSent;
            nextSent = null;
            return thisIteration;
        }

        /** Removal is not supported by this read-only iterator. */
        public void remove() { throw new UnsupportedOperationException(); }
    }
}