-
Notifications
You must be signed in to change notification settings - Fork 1
/
lexer.go
687 lines (622 loc) · 17.1 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
package parse
import (
"bufio"
"errors"
"fmt"
"io"
"strconv"
"strings"
"unicode"
"github.com/omniskop/vitrum/vit"
"github.com/omniskop/vitrum/vit/vpath"
)
// LexError contains additional information about the error that occurred
type LexError struct {
pos vit.Position
msg string
}
func (e LexError) Error() string {
return fmt.Sprintf("%v: %s", e.pos, e.msg)
}
func (e LexError) Is(subject error) bool {
_, ok := subject.(LexError)
return ok
}
func unexpectedEOF(pos vit.Position) LexError {
return LexError{
pos: pos,
msg: "unexpected end of file",
}
}
type ReadError struct {
err error
}
func (e ReadError) Error() string {
return fmt.Sprintf("read error: %v", e.err)
}
func (e ReadError) Is(subject error) bool {
_, ok := subject.(ReadError)
return ok
}
func (e ReadError) Unwrap() error {
return e.err
}
func LexAll(input io.Reader, filePath vpath.Path) ([]token, error) {
l := NewLexer(input, filePath)
allTokens := make([]token, 0, 100) // we will just start with some capacity
for {
t, err := l.Lex()
if err != nil {
return allTokens, err
}
allTokens = append(allTokens, t)
if t.tokenType == tokenEOF {
break
}
}
return allTokens, nil
}
type lexer struct {
source *bufio.Reader
filePath vpath.Path
pos vit.Position // position of the rune that will be read next
previousPosition vit.Position
expressionFollowing bool // weather the next scanned part should be an expression
}
func NewLexer(input io.Reader, filePath vpath.Path) *lexer {
return &lexer{
source: bufio.NewReader(input),
filePath: filePath,
pos: vit.Position{
FilePath: filePath,
Line: 1,
Column: 1,
},
}
}
// NewLexerAtPosition returns a new lexer that will already start at the given position.
// This can be used to improve error messages if a lexer only parses a small portion of a bigger file.
func NewLexerAtPosition(input io.Reader, position vit.Position) *lexer {
return &lexer{
source: bufio.NewReader(input),
filePath: position.FilePath,
pos: position,
}
}
// nextRune returns the next rune from the source as well as the vit.Position of that rune in the file.
// The error returned will always be of type ReadError.
func (l *lexer) nextRune() (rune, vit.Position, error) {
r, _, err := l.source.ReadRune()
if err != nil {
return r, l.pos, ReadError{err}
}
p := l.pos
l.previousPosition = l.pos
// TODO: check how this handles line breaks with a carriage return
if r == '\n' {
l.pos.Line++
l.pos.Column = 1
} else {
l.pos.Column++
}
return r, p, nil
}
// unreadRune adds the last read rune back to the buffer.
// Calling this more than once might result in incorrect vit.Positions.
func (l *lexer) unreadRune() {
l.source.UnreadRune()
l.pos = l.previousPosition
l.previousPosition.Column-- // just an estimate, we might be missing a line break
}
// Lex returns the next token from the source.
// The error returned will either be of type LexError if the source file contained an error or of type ReadError if the source broke.
func (l *lexer) Lex() (token, error) {
if l.expressionFollowing {
t, err := l.scanExpression()
if err != nil {
return t, err
}
l.expressionFollowing = false
if t.tokenType == tokenExpression && len(t.literal) == 0 {
return t, LexError{pos: t.position.Start(), msg: "empty expression"}
}
return t, nil
}
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
return token{
tokenType: tokenEOF,
position: vit.NewRangeFromPosition(pos),
}, nil
}
return token{}, err
}
switch {
case unicode.IsSpace(r) && r != '\n':
continue
case r == '"' || r == '\'' || r == '`':
return l.scanString(r)
case r == '(':
return token{
tokenType: tokenLeftParenthesis,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == ')':
return token{
tokenType: tokenRightParenthesis,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '{':
return token{
tokenType: tokenLeftBrace,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '}':
return token{
tokenType: tokenRightBrace,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == ':':
l.expressionFollowing = true
return token{
tokenType: tokenColon,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '.':
return token{
tokenType: tokenPeriod,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '[':
return token{
tokenType: tokenLeftBracket,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == ']':
return token{
tokenType: tokenRightBracket,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '<':
return token{
tokenType: tokenLess,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '>':
return token{
tokenType: tokenGreater,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '=':
return token{
tokenType: tokenAssignment,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == ',':
return token{
tokenType: tokenComma,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == ';':
return token{
tokenType: tokenSemicolon,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '\n':
return token{
tokenType: tokenNewline,
position: vit.NewRangeFromPosition(pos),
}, nil
case r == '/':
r, _, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
return token{}, LexError{l.pos, fmt.Sprintf("unexpected symbol %q", string(r))}
}
return token{}, err
}
if r == '/' || r == '*' {
_, err = l.readComment(r == '*')
if err != nil {
return token{}, err
}
} else {
l.unreadRune()
return token{}, LexError{l.pos, fmt.Sprintf("unexpected symbol %q", string(r))}
}
continue
case unicode.IsLetter(r) || r == '#':
return l.scanIdentifier(r)
case unicode.IsNumber(r), r == '-', r == '+':
l.unreadRune()
return l.scanNumber()
default:
return token{}, LexError{l.pos, fmt.Sprintf("unexpected symbol %q", string(r))}
}
}
}
// scanExpression will scan the input for an expression.
// Expressions won't be dissected in detail because it is potentially JavaScript code that will be handled separately.
func (l *lexer) scanExpression() (token, error) {
// we ignore spaces and tabs at the start
_, err := l.skip(' ', '\t')
if err != nil {
return token{}, err
}
t := token{
tokenType: tokenExpression,
position: vit.NewRangeFromPosition(l.pos),
}
str, pos, err := l.readExpressionUntil('\n', ';', '}')
if err != nil {
return t, err
}
t.literal = str // theoretically we could trim spaces at the end here, but we would need to change a lot of tests
t.position.SetEnd(pos)
return t, nil
}
// readString will read a JavaScript string from the input.
// It should be called after the first quote (which must be passed as an argument) has been read.
func (l *lexer) readString(delimiter rune) (string, vit.Position, error) {
var out strings.Builder
var lastRune rune
for {
r, pos, err := l.nextRune()
if err != nil {
return "", pos, err
}
out.WriteRune(r)
if lastRune == '\\' { // if this rune was escaped
goto continueLoop
} else if r == delimiter {
return out.String(), pos, nil
} else if delimiter == '`' && lastRune == '$' && r == '{' { // only important if this is a raw string
str, pos, err := l.readExpressionUntil('}')
if err != nil {
return "", pos, err
}
out.WriteString(str)
}
continueLoop:
lastRune = r
}
}
func (l *lexer) readExpressionUntil(stopRunes ...rune) (string, vit.Position, error) {
var out strings.Builder
var bracketTypeOpening rune // type of the outer most bracket we are tracking. It will contain the rune that opened the bracket. (one of '(', '[' or '{' )
var bracketTypeClosing rune // type of the outer most bracket we are tracking. It will contain the rune that is expected to close the bracket. (one of ')', ']' or '}' )
var openBrackets int // number brackets (which one is specified in bracketType) that have been opened
var potentialComment bool // set to true if we encountered a '/' in the last rune
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
if openBrackets > 0 {
// theoretically we could return an error here, but we will just take the expression as is
// return "", pos, LexError{pos, fmt.Sprintf("unexpected end of file, expected '%s'", string(bracketType))}
return out.String(), l.previousPosition, nil
} else {
return out.String(), l.previousPosition, nil
}
}
return "", pos, err
}
// check if we reached the end of the expression
if openBrackets == 0 && containsRune(stopRunes, r) {
l.unreadRune()
return out.String(), l.previousPosition, nil
}
out.WriteRune(r)
// check if a string starts here ...
if r == '\'' || r == '"' || r == '`' {
// ... and if so read all of it
str, pos, err := l.readString(r)
if err != nil {
return "", pos, err
}
out.WriteString(str)
continue
}
// check if this might be the start of a comment
if potentialComment {
if r == '/' || r == '*' {
// a line comment started
str, err := l.readComment(r == '*')
if err != nil {
return "", pos, err
}
out.WriteString(str)
continue
} else {
potentialComment = false
}
} else if r == '/' {
potentialComment = true
continue
}
// if we are waiting for a bracket to close...
if openBrackets > 0 {
if bracketTypeOpening == r {
// ... and this opens a new bracket of the same type
// increase the number of open brackets
openBrackets++
} else if bracketTypeClosing == r {
// ... and this closes a bracket of the same type
// mark it as closed ...
openBrackets--
// ... and check if it was the last one we expected
if openBrackets == 0 {
bracketTypeClosing = 0
}
}
} else {
// we are currently not inside of a bracket so we need to check if a new one starts here
switch r {
case '(':
bracketTypeOpening = '('
bracketTypeClosing = ')'
openBrackets++
case '[':
bracketTypeOpening = '['
bracketTypeClosing = ']'
openBrackets++
case '{':
bracketTypeOpening = '{'
bracketTypeClosing = '}'
openBrackets++
}
}
}
}
func containsRune(set []rune, r rune) bool {
for _, s := range set {
if s == r {
return true
}
}
return false
}
// skipSpaceLikes skips all spaces and newlines. It returns the vit.Position of the first non-space character.
// That rune will still be in the source as unreadRune will be called at the end.
func (l *lexer) skipSpaceLikes() (vit.Position, error) {
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
return pos, nil
}
return pos, err
}
if !unicode.IsSpace(r) { // also matches newlines
l.unreadRune()
return pos, nil
}
}
}
// skip skips all provided runes. It returns the vit.Position of the first non-skipped rune.
// That rune will still be in the source as unreadRune will be called at the end.
func (l *lexer) skip(runes ...rune) (vit.Position, error) {
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
return pos, nil
}
return pos, err
}
if !containsRune(runes, r) {
l.unreadRune()
return pos, nil
}
}
}
// scanString scans a string token that has been started with the given quotation mark. The literal will not contain the quotation marks.
// The returned error will either be of type LexError or ReadError.
func (l *lexer) scanString(quotationMark rune) (token, error) {
t := token{
tokenType: tokenString,
position: vit.NewRangeFromPosition(l.pos),
}
// NOTE: new lines are allowed inside strings
var escaped bool
var str strings.Builder
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
return t, LexError{pos, fmt.Sprintf("unexpected end of file")}
}
return t, err
}
if escaped {
escaped = false
str.WriteRune(r)
continue
}
switch r {
case quotationMark:
t.position.SetEnd(pos)
t.literal = str.String()
return t, nil
case '\\':
escaped = true
}
str.WriteRune(r)
}
}
// scanIdentifier scans an identifier token that starts with the given first rune.
// The returned error will be of type ReadError.
func (l *lexer) scanIdentifier(first rune) (token, error) {
t := token{
tokenType: tokenIdentifier,
position: vit.NewRangeFromPosition(l.pos),
}
t.position.StartColumn--
previousPosition := t.position.Start()
var str strings.Builder
str.WriteRune(first)
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
t.position.SetEnd(previousPosition)
t.literal = str.String()
return t, nil
}
return t, err
}
if validLiteralRune(r, str.Len() == 0) {
str.WriteRune(r)
} else {
t.position.SetEnd(previousPosition)
t.literal = str.String()
l.unreadRune()
return t, nil
}
previousPosition = pos
}
}
// readComment reads all runes until the end of the comment.
// It returns the read comment excluding the runes that started it (as it will be called after the comment has already started).
// For multiline comments the output WILL include the '*/' and the end of the comment.
// For single line comments the output WILL NOT include the newline that ended the comment.
func (l *lexer) readComment(multiLineComment bool) (string, error) {
var out strings.Builder
var starFound bool // only required for multiline comments
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
if multiLineComment {
return "", unexpectedEOF(pos)
}
// a singe line comment can be the last thing in a file
return out.String(), nil
}
return "", err
}
if multiLineComment {
out.WriteRune(r)
if r == '*' {
starFound = true
} else if starFound && r == '/' {
break // reached end of multiline comment
} else {
starFound = false
}
} else {
if r == '\n' {
// we have reached the end of the line
l.unreadRune()
break
}
out.WriteRune(r) // only add the rune after we checked if it's a newline
}
}
return out.String(), nil
}
// scanNumber scans a number token.
// The returned error will either be of type LexError or ReadError.
func (l *lexer) scanNumber() (token, error) {
t := token{
tokenType: tokenFloat,
position: vit.NewRangeFromPosition(l.pos),
}
// read in all runes
var previousPosition = l.pos
var str strings.Builder
var isFloatingPoint bool
var invalid bool
var numberBase int = 10
for {
r, pos, err := l.nextRune()
if err != nil {
if errors.Is(err, io.EOF) {
goto parseAndReturn
}
return token{}, err
}
switch {
case r == '.':
if isFloatingPoint {
invalid = true // more than one floating point but we will continue to scan
} else {
isFloatingPoint = true
}
case r == '+', r == '-':
// a sign that is not at the start ends this number
if str.Len() != 0 {
l.unreadRune()
goto parseAndReturn
}
case (r == 'x' || r == 'X') && numberBase == 10:
if str.String() == "0" {
numberBase = 16
} else {
l.unreadRune()
goto parseAndReturn
}
case (r == 'b' || r == 'B') && numberBase == 10:
if str.String() == "0" {
numberBase = 2
} else {
l.unreadRune()
goto parseAndReturn
}
case unicode.IsNumber(r):
case numberBase == 16 && ((r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')):
// hexadecimal number
default:
// the number has ended
l.unreadRune()
goto parseAndReturn
}
str.WriteRune(r)
previousPosition = pos
}
// now parse the read number
parseAndReturn:
t.position.SetEnd(previousPosition)
stringN := str.String()
// catch some exceptions
if len(stringN) == 0 || stringN == "." || stringN == "+" || stringN == "-" {
return t, LexError{t.position.Start(), fmt.Sprintf("number %q is incomplete", stringN)}
}
t.literal = stringN
if invalid {
t.tokenType = tokenIdentifier
return t, nil
}
if isFloatingPoint {
if numberBase != 10 {
return t, LexError{t.position.Start(), fmt.Sprintf("floating point number %q must be decimal", stringN)}
}
_, err := strconv.ParseFloat(stringN, 64)
if err != nil {
return t, LexError{t.position.Start(), fmt.Sprintf("%q is not a valid number: %v", stringN, err)}
}
t.tokenType = tokenFloat
} else {
stringToParse := stringN
if numberBase != 10 {
stringToParse = stringN[2:]
if len(stringToParse) == 0 {
return t, LexError{t.position.Start(), fmt.Sprintf("number %q is incomplete", stringN)}
}
}
_, err := strconv.ParseInt(stringToParse, numberBase, 64)
if err != nil {
return t, LexError{t.position.Start(), fmt.Sprintf("%q is not a valid number: %v", stringN, err)}
}
t.tokenType = tokenInteger
}
return t, nil
}
// validLiteralRune checks if the rune can be used in a literal
// 'first' signals that this rune would be the first of the literal which has stricter requirements
func validLiteralRune(r rune, first bool) bool {
// a literal cannot start with a '-' or a number but they can contain it
return unicode.IsLetter(r) || r == '_' || (!first && (r == '-' || unicode.IsNumber(r)))
}