/
ByteBasedParser.scala
104 lines (94 loc) · 3.14 KB
/
ByteBasedParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package jawn
import scala.annotation.{switch, tailrec}
/**
 * Parser mix-in for input that is available as UTF-8 encoded bytes.
 *
 * Implementations supply a byte() method for reading individual input
 * bytes. On top of that byte-level view this trait parses strings,
 * translating the input bytes to Chars and Strings as it goes.
 *
 * parseString() handles two cases. The first (the hot path) applies
 * when the string contains no escape sequences: the entire byte range
 * is UTF-8 decoded in one step. The second is a slower path that
 * walks the bytes one sequence at a time so that de-escaping is done
 * correctly given that the input data is UTF-8.
 */
trait ByteBasedParser[J] extends Parser[J] {

  /** Read the input byte at index `i`. */
  protected[this] def byte(i: Int): Byte

  /**
   * Scan forward from `i` looking for the closing quote of a string,
   * without decoding anything.
   *
   * The result is the index just past the closing quote, or -1 as soon
   * as a backslash is met (meaning the string contains an escape
   * sequence). A byte below 32 is reported through die().
   *
   * The data is expected to be UTF-8 and is inspected byte by byte, so
   * any byte with the highest bit set is simply stepped over.
   *
   * `ctxt` is accepted but not used here.
   */
  protected[this] final def parseStringSimple(i: Int, ctxt: FContext[J]): Int = {
    @tailrec def scan(pos: Int): Int = {
      val b = byte(pos) & 0xff
      // 34 is the closing "
      if (b == 34) pos + 1
      else {
        if (b < 32) die(pos, "control char (%d) in string" format b)
        // 92 is \ -- an escape is present, so give up on the fast scan
        if (b == 92) -1
        else scan(pos + 1)
      }
    }
    scan(i)
  }

  /**
   * Parse a string according to JSON rules and add it to `ctxt`.
   *
   * Scanning begins at `i + 1`, so `i` is expected to be the index of
   * the opening quote. The result is the index just past the closing
   * quote.
   *
   * The data is expected to be UTF-8 and is accessed as bytes.
   */
  protected[this] final def parseString(i: Int, ctxt: FContext[J]): Int = {
    val start = i + 1
    val fastEnd = parseStringSimple(start, ctxt)

    if (fastEnd != -1) {
      // Hot path: no escapes, so decode everything between the quotes at once.
      ctxt.add(at(start, fastEnd - 1))
      fastEnd
    } else {
      // TODO: we might be able to do better by identifying where
      // escapes occur, and then translating the intermediate strings
      // in one go.
      val buf = new CharBuilder
      var pos = start
      var b: Int = byte(pos) & 0xff

      while (b != 34) { // stop at the closing "
        if (b == 92) { // backslash introduces an escape
          // Every escape spans two bytes except \uXXXX, which spans six.
          var width = 2
          (byte(pos + 1): @switch) match {
            case 34  => buf.append('"')
            case 47  => buf.append('/')
            case 92  => buf.append('\\')
            case 98  => buf.append('\b')
            case 102 => buf.append('\f')
            case 110 => buf.append('\n')
            case 114 => buf.append('\r')
            case 116 => buf.append('\t')
            case 117 =>
              // if there's a problem then descape will explode
              buf.append(descape(at(pos + 2, pos + 6)))
              width = 6
            case _ => die(pos, "invalid escape sequence")
          }
          pos += width
        } else if (b < 32) {
          die(pos, "control char (%d) in string" format b)
        } else if (b < 128) {
          // 1-byte UTF-8 sequence
          buf.append(b.toChar)
          pos += 1
        } else {
          // The lead byte's high bits give the length of a 2-, 3- or
          // 4-byte UTF-8 sequence; 0 marks an unrecognised lead byte.
          val width =
            if ((b & 0xe0) == 0xc0) 2
            else if ((b & 0xf0) == 0xe0) 3
            else if ((b & 0xf8) == 0xf0) 4
            else 0

          if (width == 0) {
            die(pos, "invalid UTF-8 encoding")
          } else {
            buf.extend(at(pos, pos + width))
            pos += width
          }
        }
        b = byte(pos) & 0xff
      }

      ctxt.add(buf.makeString)
      pos + 1
    }
  }
}