-
Notifications
You must be signed in to change notification settings - Fork 6
/
tokenizer.go
144 lines (131 loc) · 3.93 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
package runner
import (
"bufio"
"io"
"unicode"
"unicode/utf8"
)
const (
	// MaxTokenLength is the maximum length in bytes of a single token
	// (4 MiB). Any token longer than that is treated as EOF.
	MaxTokenLength = 4 << 20 // 4MiB
)
// TokenizerFunc is a predicate used by Tokenizer to decide token boundaries:
// it returns true if the rune is part of a token, and false if the rune
// separates tokens.
type TokenizerFunc func(rune) bool
// Token represents a single token scanned from the stream.
type Token struct {
	// Text is the token's contents.
	Text string
	// Line and Column are the position of the token's first rune.
	// Both counters start at 1 (see splitFunc).
	Line, Column int
}
// TokenMismatch represents a validation error where two tokens are considered
// to be different. Either field may be nil when the corresponding stream had
// no token at that position — NOTE(review): inferred from the pointer types;
// confirm against the comparison code that builds these.
type TokenMismatch struct {
	// Contestant is the token produced by the contestant's output.
	Contestant *Token
	// Expected is the corresponding token from the expected output.
	Expected *Token
}
// IsNonWhitespace returns true if the rune is neither a Unicode space nor a
// Java whitespace character. The only characters that are Java whitespace
// but not Unicode whitespace are:
//
//	U+001C FILE SEPARATOR
//	U+001D GROUP SEPARATOR
//	U+001E RECORD SEPARATOR
//	U+001F UNIT SEPARATOR
func IsNonWhitespace(r rune) bool {
	// Java-only whitespace: the four information-separator control characters.
	isJavaOnlySpace := r >= '\u001c' && r <= '\u001f'
	return !(unicode.IsSpace(r) || isJavaOnlySpace)
}
// IsNumeric returns true if the rune may be part of a number: an ASCII
// decimal digit, a decimal point, or a minus sign.
func IsNumeric(r rune) bool {
	switch {
	case r == '.', r == '-':
		return true
	default:
		return '0' <= r && r <= '9'
	}
}
// Tokenizer has mostly the same functionality as bufio.Scanner, but also
// provides the line and column information of the scanned tokens.
type Tokenizer struct {
	// scanner provides buffered scanning of the underlying reader.
	scanner *bufio.Scanner
	// tokenizerFunc reports whether a rune belongs to a token.
	tokenizerFunc TokenizerFunc
	// line and column hold the position of the most recently scanned
	// token; -1 until the first successful Scan.
	line, column int
}
// NewTokenizer returns a new Tokenizer that reads from r and uses the
// specified tokenizer function.
// NewTokenizer returns a new Tokenizer that reads from r and uses the
// specified tokenizer function to decide which runes belong to tokens.
func NewTokenizer(r io.Reader, tokenizerFunc TokenizerFunc) *Tokenizer {
	t := &Tokenizer{
		scanner:       bufio.NewScanner(r),
		tokenizerFunc: tokenizerFunc,
		// Position is unknown until the first token is scanned.
		line:   -1,
		column: -1,
	}
	// Install the position-tracking split function and raise the scanner's
	// token size limit to MaxTokenLength.
	t.scanner.Split(t.splitFunc())
	t.scanner.Buffer(nil, MaxTokenLength)
	return t
}
// Scan advances the Tokenizer to the next token, which is available through
// the Token method. It returns false when the tokenization stops, either by
// reaching the end of the input or an error. After Scan returns false, the Err
// method will return any error that ocurred during tokenization, except that
// if it was io.EOF, Err will return nil.
// Scan advances the Tokenizer to the next token, which is available through
// the Token method. It returns false when the tokenization stops, either by
// reaching the end of the input or an error. After Scan returns false, the
// Err method will return any error that occurred during tokenization, except
// that if it was io.EOF, Err will return nil.
func (t *Tokenizer) Scan() bool {
	return t.scanner.Scan()
}
// Token returns the most recent token generated by a call to Scan as a newly
// allocated Token holding the scanned text and its position information.
func (t *Tokenizer) Token() *Token {
	tok := &Token{
		Line:   t.line,
		Column: t.column,
	}
	tok.Text = t.scanner.Text()
	return tok
}
// Err returns the first non-EOF error that was encountered by the Tokenizer.
// It delegates directly to the underlying bufio.Scanner.
func (t *Tokenizer) Err() error {
	return t.scanner.Err()
}
// splitFunc is a wrapper around bufio.SplitFunc that calculates line and
// column information for tokens.
func (t *Tokenizer) splitFunc() bufio.SplitFunc {
line, column := 1, 1
lastLine, lastColumn := -1, -1
return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
// Skip leading non-characters.
start := 0
for width := 0; start < len(data); start += width {
var r rune
r, width = utf8.DecodeRune(data[start:])
if t.tokenizerFunc(r) {
break
}
if r == '\n' {
line++
column = 1
} else {
column++
}
}
if lastLine == -1 {
lastLine, lastColumn = line, column
}
// Scan until first non-character, marking end of token.
for width, i := 0, start; i < len(data); i += width {
var r rune
r, width = utf8.DecodeRune(data[i:])
if r == '\n' {
line++
column = 1
} else {
column++
}
if !t.tokenizerFunc(r) {
t.line, t.column = lastLine, lastColumn
lastLine, lastColumn = -1, -1
return i + width, data[start:i], nil
}
}
// If we're at EOF, we have a final, non-empty, non-terminated token. Return it.
if atEOF && len(data) > start {
t.line, t.column = lastLine, lastColumn
lastLine, lastColumn = -1, -1
return len(data), data[start:], nil
}
// Request more data.
return start, nil, nil
}
}