-
Notifications
You must be signed in to change notification settings - Fork 72
/
AsyncParser.scala
322 lines (290 loc) · 10.4 KB
/
AsyncParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
package jawn
import scala.annotation.{switch, tailrec}
import scala.math.max
import scala.collection.mutable
import scala.util.control
import java.nio.ByteBuffer
/**
 * Companion of the AsyncParser class: defines the available streaming
 * modes and the factory used to create parsers.
 */
object AsyncParser {

  /**
   * A streaming mode.
   *
   * @param start the value a new parser's `state` field is initialised to
   * @param value the value a new parser's `streamMode` field is initialised to
   */
  sealed abstract class Mode(val start: Int, val value: Int)

  /** Starts in state -5 (ASYNC_PRESTART) with streamMode 1. */
  case object UnwrapArray extends Mode(-5, 1)

  /** Starts in state -1 (ASYNC_PREVAL) with streamMode 0. */
  case object ValueStream extends Mode(-1, 0)

  /** Starts in state -1 (ASYNC_PREVAL) with streamMode -1. */
  case object SingleValue extends Mode(-1, -1)

  // number of bytes allocated for a new parser's buffer
  private[this] final val InitialCapacity = 131072

  /**
   * Creates a parser with an empty buffer of InitialCapacity bytes.
   *
   * @param mode the streaming mode to use; defaults to SingleValue
   */
  def apply[J](mode: Mode = SingleValue): AsyncParser[J] =
    new AsyncParser[J](
      state = mode.start,
      curr = 0,
      stack = Nil,
      data = new Array[Byte](InitialCapacity),
      len = 0,
      allocated = InitialCapacity,
      offset = 0,
      done = false,
      streamMode = mode.value
    )
}
/**
* AsyncParser is able to parse JSON incrementally from chunks of data.
* Chunks are supplied through absorb() (as a ByteBuffer, an Array[Byte]
* or a String), and finish() signals the end of the input. You can use
* the factory method in the companion object to instantiate an async
* parser.
*
* The async parser's fields are described below:
*
* The (state, curr, stack) triple is used to save and restore parser
* state between async calls. State also helps encode extra
* information when streaming or unwrapping an array.
*
* The (data, len, allocated) triple is used to manage the underlying
* data the parser is keeping track of. As new data comes in, data may
* be expanded if not enough space is available.
*
* The offset parameter is used to drive the outer async parsing. It
* stores similar information to curr but is kept separate to avoid
* "corrupting" our snapshot.
*
* The done parameter is used internally to help figure out when the
* atEof() parser method should return true. It is set when finish()
* is called, and cleared again by every call to absorb().
*
* The streamMode parameter controls how the asynchronous parser will
* be handling multiple values. There are three states:
*
* 1: An array is being unwrapped. Normal JSON array rules apply
* (Note that if the outer value observed is not an array, this
* mode will toggle to the -1 mode).
*
* 0: A stream of individual JSON elements separated by whitespace
* are being parsed. We can return each complete element as we
* parse it.
*
* -1: No streaming is occurring. Only a single JSON value is
* allowed.
*/
final class AsyncParser[J] protected[jawn] (
protected[jawn] var state: Int,
protected[jawn] var curr: Int,
protected[jawn] var stack: List[FContext[J]],
protected[jawn] var data: Array[Byte],
protected[jawn] var len: Int,
protected[jawn] var allocated: Int,
protected[jawn] var offset: Int,
protected[jawn] var done: Boolean,
protected[jawn] var streamMode: Int
) extends ByteBasedParser[J] {
// number of newline() calls made so far; starts at 0
protected[this] var line: Int = 0

// index just past the most recently recorded line break; column() is
// measured relative to this
protected[this] var pos: Int = 0

/** Records a line break located at index `i`. */
protected[this] final def newline(i: Int): Unit = {
  pos = i + 1
  line += 1
}

/** Distance of index `i` from the start of the current line. */
protected[this] final def column(i: Int): Int = i - pos
/**
 * Builds a second parser from this parser's current constructor-level
 * state.
 *
 * The byte buffer is cloned, so the two parsers do not share `data`. The
 * `stack` list is passed along by reference.
 *
 * NOTE(review): `line` and `pos` are not constructor arguments, so the new
 * parser starts with both at 0 -- confirm whether positions reported by a
 * copy are expected to carry over.
 */
final def copy(): AsyncParser[J] = {
  val buffer = data.clone()
  new AsyncParser(state, curr, stack, buffer, len, allocated, offset, done, streamMode)
}
/**
 * Appends the bytes between `buf`'s position and limit to the internal
 * buffer and parses as far as the available data allows.
 *
 * Clears the `done` flag, grows the buffer if needed, and advances `buf`'s
 * position past the bytes that were read.
 *
 * @param buf the next chunk of input
 * @return the result of churn(): the values completed by this call, or a
 *         parse error
 */
final def absorb(buf: ByteBuffer)(implicit facade: Facade[J]): Either[ParseException, Seq[J]] = {
  done = false
  val incoming = buf.remaining()
  val total = len + incoming
  resizeIfNecessary(total)
  // copy the chunk in directly after the bytes we already hold
  buf.get(data, len, incoming)
  len = total
  churn()(facade)
}
/**
 * Convenience overload: wraps `bytes` in a ByteBuffer and delegates to the
 * ByteBuffer variant of absorb.
 */
final def absorb(bytes: Array[Byte])(implicit facade: Facade[J]): Either[ParseException, Seq[J]] = {
  val wrapped = ByteBuffer.wrap(bytes)
  absorb(wrapped)(facade)
}
/**
 * Convenience overload: encodes `s` with the `utf8` charset, wraps the
 * bytes in a ByteBuffer and delegates to the ByteBuffer variant of absorb.
 */
final def absorb(s: String)(implicit facade: Facade[J]): Either[ParseException, Seq[J]] = {
  val encoded = s.getBytes(utf8)
  absorb(ByteBuffer.wrap(encoded))(facade)
}
/**
 * Signals that no more input will arrive: sets the `done` flag and runs
 * churn() one more time over whatever is still buffered.
 *
 * @return the result of churn(): any remaining values, or a parse error
 */
final def finish()(implicit facade: Facade[J]): Either[ParseException, Seq[J]] = {
  done = true
  churn()(facade)
}
/**
 * Ensures that `data` can hold at least `need` bytes.
 *
 * When `need` exceeds `allocated`, a larger array is allocated, the first
 * `len` bytes are copied over, and `data` / `allocated` are updated.
 * Otherwise nothing happens.
 *
 * @param need the total number of bytes the buffer must be able to hold
 */
protected[this] final def resizeIfNecessary(need: Int): Unit = {
  // if we don't have enough free space available we'll need to grow our
  // data array. we never shrink the data array, assuming users will call
  // absorb with similarly-sized buffers.
  if (need > allocated) {
    // Once doubling would overflow an Int we fall back to the largest
    // array length the JVM is likely to grant. Requesting Int.MaxValue
    // itself is rejected by HotSpot-based JVMs ("Requested array size
    // exceeds VM limit") no matter how much heap is available, so we stay
    // a few elements below it.
    val maxArraySize = Int.MaxValue - 8
    val doubled = if (allocated < 0x40000000) allocated * 2 else maxArraySize
    val newsize = max(need, doubled)
    val newdata = new Array[Byte](newsize)
    System.arraycopy(data, 0, newdata, 0, len)
    data = newdata
    allocated = newsize
  }
}
/**
 * Synthetic parser states.
 *
 * The parser machinery uses positive integers for its states while it is
 * inside a json value. The negative values below describe where the async
 * parser is *between* json values.
 *
 * ASYNC_PRESTART (-5): no non-whitespace has been seen yet. The input may
 *   or may not turn out to be an array; we are waiting for valid JSON.
 *
 * ASYNC_START (-4): the opening [ of the array being unwrapped has been
 *   seen. Next comes either ] (empty array) or a JSON value.
 *
 * ASYNC_END (-3): the final ] of the array has been seen. Only whitespace
 *   or EOF may follow.
 *
 * ASYNC_POSTVAL (-2): a value inside the array was just parsed. We expect
 *   whitespace, a comma, or an EOF.
 *
 * ASYNC_PREVAL (-1): we are in an array and a comma was just seen. We
 *   expect whitespace or a JSON value.
 */
@inline private[this] final def ASYNC_PRESTART: Int = -5
@inline private[this] final def ASYNC_START: Int = -4
@inline private[this] final def ASYNC_END: Int = -3
@inline private[this] final def ASYNC_POSTVAL: Int = -2
@inline private[this] final def ASYNC_PREVAL: Int = -1
/**
 * Parses as much of the buffered input as possible.
 *
 * The loop alternates between two activities:
 *
 *  - while state is negative, it looks at the single byte at offset
 *    (whitespace, '[', ',', ']' or anything else) and either advances
 *    offset, moves to another negative state, switches to state 0 to
 *    begin a value, or calls die();
 *
 *  - while state is non-negative, it parses a value: from scratch with
 *    parse(offset) when state is 0, or by resuming with
 *    rparse(state, curr, stack) when state is positive.
 *
 * The loop never terminates normally. It ends when at() throws an
 * AsyncException because offset has reached len, or when a ParseException
 * propagates.
 *
 * @return Right with the values completed during this call, or Left with
 *         a ParseException
 */
protected[jawn] def churn()(implicit facade: Facade[J]): Either[ParseException, Seq[J]] = {
// accumulates json values
val results = mutable.ArrayBuffer.empty[J]
// we rely on exceptions to tell us when we run out of data
try {
while (true) {
if (state < 0) {
// between values: classify the next byte. at() throws AsyncException
// once offset reaches len, which is how this loop is left.
(at(offset): @switch) match {
case '\n' =>
// whitespace that also advances the line counter
newline(offset)
offset += 1
case ' ' | '\t' | '\r' =>
offset += 1
case '[' =>
if (state == ASYNC_PRESTART) {
// first non-whitespace byte is '[': consume it and start unwrapping
offset += 1
state = ASYNC_START
} else if (state == ASYNC_END) {
die(offset, "expected eof")
} else if (state == ASYNC_POSTVAL) {
die(offset, "expected , or ]")
} else {
// remaining states (ASYNC_START, ASYNC_PREVAL): the '[' opens a value
// of its own, so it is not consumed here
state = 0
}
case ',' =>
if (state == ASYNC_POSTVAL) {
offset += 1
state = ASYNC_PREVAL
} else if (state == ASYNC_END) {
die(offset, "expected eof")
} else {
die(offset, "expected json value")
}
case ']' =>
if (state == ASYNC_POSTVAL || state == ASYNC_START) {
// a closing bracket is only accepted while streamMode is positive
if (streamMode > 0) {
offset += 1
state = ASYNC_END
} else {
die(offset, "expected json value or eof")
}
} else if (state == ASYNC_END) {
die(offset, "expected eof")
} else {
die(offset, "expected json value")
}
case c =>
if (state == ASYNC_END) {
die(offset, "expected eof")
} else if (state == ASYNC_POSTVAL) {
die(offset, "expected ] or ,")
} else {
// the outermost value is not an array: drop from mode 1 to mode -1
if (state == ASYNC_PRESTART && streamMode > 0) streamMode = -1
state = 0
}
}
} else {
// jump straight back into rparse
// NOTE(review): reset() may shift data and offset, but curr is passed to
// rparse unchanged below -- presumably the two cannot be out of step
// here; confirm against the rparse implementation.
offset = reset(offset)
val (value, j) = if (state <= 0) {
parse(offset)
} else {
rparse(state, curr, stack)
}
// a value was completed; choose the next between-values state from the
// streaming mode
if (streamMode > 0) {
state = ASYNC_POSTVAL
} else if (streamMode == 0) {
state = ASYNC_PREVAL
} else {
state = ASYNC_END
}
curr = j
offset = j
stack = Nil
results.append(value)
}
}
// the loop condition above is the literal true, so control only leaves
// the loop through an exception; this line exists to give the try block
// a value of the right type
Right(results)
} catch {
case e: AsyncException =>
if (done) {
// if we are done, make sure we ended at a good stopping point
if (state == ASYNC_PREVAL || state == ASYNC_END) Right(results)
else Left(ParseException("exhausted input", -1, -1, -1))
} else {
// we ran out of data, so return what we have so far
Right(results)
}
case e: ParseException =>
// we hit a parser error, so return that error. note that only the
// error is returned: values accumulated in results during this call
// are not part of the Left.
Left(e)
}
}
/**
 * Every 1M we shift our array back by 1M.
 *
 * Once `offset` has reached 1048576, the buffered bytes from that index on
 * are moved to the front of `data`, and `len`, `offset` and `pos` are
 * reduced by the same amount.
 *
 * @param i an index into `data`
 * @return `i` translated into the (possibly shifted) buffer
 */
protected[this] final def reset(i: Int): Int = {
  val shift = 1048576
  if (offset < shift) {
    i
  } else {
    len -= shift
    offset -= shift
    pos -= shift
    System.arraycopy(data, shift, data, 0, len)
    i - shift
  }
}
/**
 * Remembers the last recoverable place we've seen, so that parsing can be
 * resumed from there after an AsyncException.
 *
 * This method is called during every loop of rparse; the three arguments
 * are exactly what has to be handed back to rparse to continue where we
 * left off.
 *
 * @param state the parser state to resume in
 * @param i     the index to resume at (stored in `curr`)
 * @param stack the context stack to resume with
 */
protected[this] final def checkpoint(state: Int, i: Int, stack: List[FContext[J]]): Unit = {
  this.stack = stack
  this.curr = i
  this.state = state
}
/**
 * Specialized accessor for the case where our underlying data are bytes,
 * not chars.
 *
 * Throws an AsyncException when `i` is at or beyond `len`, i.e. when the
 * requested byte has not been received yet.
 */
protected[this] final def byte(i: Int): Byte =
  if (i < len) data(i) else throw new AsyncException
/**
 * Returns the byte at index `i`, converted to a Char.
 *
 * We need to signal if we got out-of-bounds: an AsyncException is thrown
 * when `i` is at or beyond `len`.
 */
protected[this] final def at(i: Int): Char =
  if (i < len) data(i).toChar else throw new AsyncException
/**
 * Access a byte range as a string.
 *
 * Since the underlying data are UTF-8 encoded, i and k must occur on unicode
 * boundaries. Also, the resulting String is not guaranteed to have length
 * (k - i).
 *
 * Throws an AsyncException when `k` lies beyond `len`, i.e. when the range
 * has not been fully received yet.
 *
 * @param i index of the first byte of the range (inclusive)
 * @param k index just past the last byte of the range (exclusive)
 */
protected[this] final def at(i: Int, k: Int): String = {
  if (k > len) throw new AsyncException
  // decode straight out of the buffer; no intermediate copy of the range
  new String(data, i, k - i, utf8)
}
// the basic idea is that we don't signal EOF until done is true, which
// means the client has explicitly sent us an EOF. until then, running past
// the buffered data is never reported as the end of the input.
protected[this] final def atEof(i: Int): Boolean = done && i >= len
// nothing special is needed on close, so this is a no-op.
protected[this] final def close(): Unit = ()
}
/**
 * This class is used internally by AsyncParser to signal that we've
 * reached the end of the particular input we were given.
 *
 * The parser's byte/char accessors throw it when asked for an index at or
 * beyond the buffered length, and churn() catches it to stop parsing until
 * more data arrives. It mixes in NoStackTrace because it is used for
 * control flow rather than for error reporting.
 */
private[jawn] class AsyncException extends Exception with control.NoStackTrace
/**
 * This is a more prosaic exception which indicates that we've hit a
 * parsing error.
 *
 * NOTE(review): nothing in this file throws or catches FailureException;
 * churn() reports parse errors through ParseException. Presumably it is
 * used elsewhere in the package -- confirm before relying on it.
 */
private[jawn] class FailureException extends Exception