/
PythonesqueTokenSource.java
228 lines (213 loc) · 7.04 KB
/
PythonesqueTokenSource.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
package com.euclideanspace.whitespaceblock;
import java.util.Stack;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.Token;
import org.eclipse.xtext.parser.antlr.AbstractSplittingTokenSource;
import org.eclipse.xtext.parser.antlr.ITokenAcceptor;
import com.euclideanspace.whitespaceblock.parser.antlr.internal.InternalDemoLexer;
/**
* Provides a token source for a language that uses whitespace to delineate
* blocks, in a similar way to the Python language.
*
* TokenSource lives between the lexer and the parser. When the
* parser calls TokenSource.nextToken() then TokenSource will either pass on
* the token returned by Lexer.nextToken() or insert a PhantomToken.
*
* A typical use-case is Python-like block delineation. Here
* the TokenSource will read the whitespace and insert PhantomTokens into the
* token-stream to mark the beginning and end of blocks.
*
* This makes it possible to parse code where it would be difficult or impossible
* to define a suitable grammar otherwise.
*
* Known Bugs
* ----------
* There is a known bug here: https://github.com/martinbaker/xtextadd/issues/1
*
* @author Martin Baker
*
*/
public class PythonesqueTokenSource extends AbstractSplittingTokenSource {

    /** Number of indent columns that one tab character counts for. */
    private static final int TAB_WIDTH = 4;

    /**
     * Indent levels of the blocks that are currently open. A level is pushed
     * for every BEGIN scheduled and popped for every END scheduled.
     */
    Stack<Integer> pile = new Stack<Integer>();

    /**
     * Indent of the line currently being read: spaces count 1 and tabs count
     * {@link #TAB_WIDTH}, measured from the most recent newline.
     */
    int indent = 0;

    /**
     * state tells us what part of the line we are currently reading:
     * FIRSTLINE - when reading first line don't insert BEGIN because we don't need BEGIN-END round whole program.
     * INDENT - where we count the number of spaces or tabs to update indent
     * BODY - we have read a non-space since last newline so we are no longer in indent
     * CONTINUATION - we have just read a continuation token so next line does not alter indent
     * @author Martin Baker
     */
    private enum StateValues { FIRSTLINE, INDENT, BODY, CONTINUATION }

    /** Current position in the line-reading state machine. */
    StateValues state = StateValues.FIRSTLINE;

    /**
     * Indent of the first non-empty line. It is the reference level used
     * whenever {@link #pile} is empty.
     */
    int initialIndent = 0;

    /**
     * Number of BEGIN tokens that doSplitToken must emit.
     */
    int indentIncrement = 0;

    /**
     * Number of END tokens that doSplitToken must emit.
     */
    int indentDecrement = 0;

    /**
     * Most recent non-EOF token passed to shouldSplitToken (only updated when
     * that token is a CommonToken).
     * TODO this is a hack, it would be better if AbstractSplittingTokenSource
     * provided this.
     */
    CommonToken thisToken = null;

    /**
     * Value {@link #thisToken} had before the most recent non-EOF token was
     * seen; doSplitToken hands it to every PhantomToken it creates.
     * TODO this is a hack, it would be better if AbstractSplittingTokenSource
     * provided this.
     */
    CommonToken lastToken = null;

    /**
     * Advances the indent state machine by one token and decides whether
     * doSplitToken has to run for it.
     *
     * As a side effect this updates {@link #indent}, {@link #state},
     * {@link #pile} and the {@link #indentIncrement} /
     * {@link #indentDecrement} counters that doSplitToken consumes.
     *
     * @param token the token just produced by the lexer.
     * @return true if doSplitToken should be called for this token.
     */
    @Override
    protected boolean shouldSplitToken(Token token) {
        // if end-of-file then close any remaining blocks
        if (token.getType() == Token.EOF) {
            if (pile.empty()) {
                return false;
            }
            // NOTE(review): lastToken is not refreshed on this path, so the END
            // tokens emitted at EOF are built from the token *before* the last
            // real token - confirm whether PhantomToken depends on that.
            while (!pile.empty()) {
                pile.pop();
                indentDecrement++;
            }
            return true;
        }

        lastToken = thisToken;
        if (token instanceof CommonToken) {
            thisToken = (CommonToken) token;
        }

        final int type = token.getType();
        switch (state) {
        case FIRSTLINE:
            if (type == InternalDemoLexer.RULE_WS) {
                indent = countSpaces(indent, token.getText());
                return false;
            } else if (type == InternalDemoLexer.RULE_SL_COMMENT) {
                indent = 0; // comment contains new line
                return false;
            } else if (type == InternalDemoLexer.RULE_LINECONTINUATION) {
                state = StateValues.CONTINUATION;
                return false;
            } else {
                // first real token: its indent becomes the base level; no
                // BEGIN is scheduled, so doSplitToken only forwards the token
                state = StateValues.BODY;
                initialIndent = indent;
                return true;
            }

        case INDENT:
            if (type == InternalDemoLexer.RULE_WS) {
                indent = countSpaces(indent, token.getText());
                return false;
            } else if (type == InternalDemoLexer.RULE_SL_COMMENT) {
                indent = 0; // comment contains new line
                return false;
            } else if (type == InternalDemoLexer.RULE_LINECONTINUATION) {
                state = StateValues.CONTINUATION;
                return false;
            } else {
                state = StateValues.BODY;
                // if pile is empty use initialIndent
                int peek = pile.empty() ? initialIndent : pile.peek();
                if (indent == peek) {
                    return false;
                }
                // dedent: close every open block that is deeper than this line
                while (!pile.empty() && indent < peek) {
                    pile.pop();
                    peek = pile.empty() ? initialIndent : pile.peek();
                    indentDecrement++;
                }
                // deeper than the enclosing level (possibly after closing
                // blocks above): open one new block at this indent
                if (indent > peek) {
                    pile.push(indent);
                    indentIncrement++;
                }
                return true;
            }

        case BODY:
            if (type == InternalDemoLexer.RULE_WS) {
                int c = countSpacesAfterNewline(token.getText());
                if (c < 0) {
                    return false; // whitespace inside the line, not a line break
                }
                indent = c;
                state = StateValues.INDENT;
                return false;
            } else if (type == InternalDemoLexer.RULE_SL_COMMENT) {
                // comment contains new line, so the next line starts at column 0
                indent = 0;
                state = StateValues.INDENT;
                // Without this return the switch fell through into CONTINUATION,
                // whose comment branch reset state to BODY and so skipped the
                // indent measurement of the following line.
                return false;
            } else if (type == InternalDemoLexer.RULE_LINECONTINUATION) {
                state = StateValues.CONTINUATION;
                return false;
            } else {
                return false;
            }

        case CONTINUATION:
            if (type == InternalDemoLexer.RULE_WS) {
                int c = countSpacesAfterNewline(token.getText());
                if (c < 0) {
                    // NOTE(review): c < 0 means this whitespace holds NO newline,
                    // which does not match the wording of the original comment
                    // ("if newline follows CONTINUATION then don't change indent
                    // on new line"). Behaviour kept as-is; confirm against the
                    // lexer's LINECONTINUATION rule.
                    state = StateValues.BODY;
                }
                return false;
            } else if (type == InternalDemoLexer.RULE_SL_COMMENT) {
                // if comment follows CONTINUATION then don't change indent on new line
                state = StateValues.BODY;
                return false;
            } else {
                // another continuation token or any other token changes nothing
                return false;
            }

        default:
            return false;
        }
    }

    /**
     * Emits the phantom block tokens scheduled by shouldSplitToken, followed
     * by the real token, and resets both counters to zero.
     *
     * END tokens are emitted before BEGIN tokens. Both counters are non-zero
     * only when a line dedents to a level that was never opened; in that case
     * shouldSplitToken pops the deeper blocks first and then pushes the new
     * level, so the emitted order mirrors the indent stack.
     *
     * @param token the token that triggered the split; always forwarded last.
     * @param result receives the resulting token sequence.
     */
    @Override
    protected void doSplitToken(Token token, ITokenAcceptor result) {
        while (indentDecrement > 0) {
            result.accept(new PhantomToken(InternalDemoLexer.RULE_END, lastToken));
            indentDecrement--;
        }
        while (indentIncrement > 0) {
            result.accept(new PhantomToken(InternalDemoLexer.RULE_BEGIN, lastToken));
            indentIncrement--;
        }
        result.accept(token);
    }

    /**
     * count the number of spaces and add to indent.
     * A space adds 1, a tab adds {@link #TAB_WIDTH}, and a newline or carriage
     * return resets the count to 0.
     * @param currentIndent the number of spaces already read.
     * @param text contains 1 or more whitespace elements; may be null.
     * @return new value of indent (currentIndent unchanged if text is null).
     */
    protected int countSpaces(int currentIndent, String text) {
        int result = currentIndent;
        if (text == null) {
            return result;
        }
        for (int i = 0; i < text.length(); i++) {
            switch (text.charAt(i)) {
            case ' ':
                result++;
                break;
            case '\t':
                result += TAB_WIDTH;
                break;
            case '\n':
            case '\r':
                result = 0;
                break;
            default:
                break;
            }
        }
        return result;
    }

    /**
     * returns -1 if no newline, otherwise number of spaces after it.
     * Only whitespace after the last newline or carriage return is counted;
     * a space counts 1 and a tab counts {@link #TAB_WIDTH}.
     * @param text contains 1 or more whitespace elements; may be null.
     * @return -1 if no newline (or text is null), otherwise number of spaces after it
     */
    protected int countSpacesAfterNewline(String text) {
        int result = -1;
        if (text == null) {
            return result;
        }
        for (int i = 0; i < text.length(); i++) {
            switch (text.charAt(i)) {
            case ' ':
                if (result > -1) {
                    result++;
                }
                break;
            case '\t':
                if (result > -1) {
                    result += TAB_WIDTH;
                }
                break;
            case '\n':
            case '\r':
                result = 0;
                break;
            default:
                break;
            }
        }
        return result;
    }
}