-
Notifications
You must be signed in to change notification settings - Fork 6
/
tokenizer.go
144 lines (131 loc) · 3.93 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
package runner
import (
"bufio"
"io"
"unicode"
"unicode/utf8"
)
const (
	// MaxTokenLength is the maximum length in bytes of a single token
	// (4 MiB). Any token longer than that is treated as EOF.
	MaxTokenLength = 4 << 20 // 4MiB
)
// TokenizerFunc is a predicate used by Tokenizer to decide token boundaries:
// it returns true if the rune is part of a token, and false if the rune
// separates tokens.
type TokenizerFunc func(rune) bool
// Token represents a single token scanned from the stream.
type Token struct {
	// Text is the token's contents.
	Text string
	// Line and Column are the position of the token's first rune.
	// Both counters start at 1 (see splitFunc).
	Line, Column int
}
// TokenMismatch represents a validation error where two tokens are considered
// to be different. Either field may be nil when the corresponding stream had
// no token at that position — NOTE(review): inferred from the pointer types;
// confirm against the comparison code that builds these.
type TokenMismatch struct {
	// Contestant is the token produced by the contestant's output.
	Contestant *Token
	// Expected is the corresponding token from the expected output.
	Expected *Token
}
// IsNonWhitespace returns true if the rune is neither a Unicode space nor a
// Java whitespace character. The only characters that are Java whitespace
// but not Unicode whitespace are:
//
//	U+001C FILE SEPARATOR
//	U+001D GROUP SEPARATOR
//	U+001E RECORD SEPARATOR
//	U+001F UNIT SEPARATOR
func IsNonWhitespace(r rune) bool {
	// Java-only whitespace: the four information-separator control characters.
	isJavaOnlySpace := r >= '\u001c' && r <= '\u001f'
	return !(unicode.IsSpace(r) || isJavaOnlySpace)
}
// IsNumeric returns true if the rune may be part of a number: an ASCII
// decimal digit, a decimal point, or a minus sign.
func IsNumeric(r rune) bool {
	switch {
	case r == '.', r == '-':
		return true
	default:
		return '0' <= r && r <= '9'
	}
}
// Tokenizer has mostly the same functionality as bufio.Scanner, but also
// provides the line and column information of the scanned tokens.
type Tokenizer struct {
	// scanner provides buffered scanning of the underlying reader.
	scanner *bufio.Scanner
	// tokenizerFunc reports whether a rune belongs to a token.
	tokenizerFunc TokenizerFunc
	// line and column hold the position of the most recently scanned
	// token; -1 until the first successful Scan.
	line, column int
}
// NewTokenizer returns a new Tokenizer that reads from r and uses the
// specified tokenizer function.
// NewTokenizer returns a new Tokenizer that reads from r and uses the
// specified tokenizer function to decide which runes belong to tokens.
func NewTokenizer(r io.Reader, tokenizerFunc TokenizerFunc) *Tokenizer {
	t := &Tokenizer{
		scanner:       bufio.NewScanner(r),
		tokenizerFunc: tokenizerFunc,
		// Position is unknown until the first token is scanned.
		line:   -1,
		column: -1,
	}
	// Install the position-tracking split function and raise the scanner's
	// token size limit to MaxTokenLength.
	t.scanner.Split(t.splitFunc())
	t.scanner.Buffer(nil, MaxTokenLength)
	return t
}
// Scan advances the Tokenizer to the next token, which is available through
// the Token method. It returns false when the tokenization stops, either by
// reaching the end of the input or an error. After Scan returns false, the Err
// method will return any error that ocurred during tokenization, except that
// if it was io.EOF, Err will return nil.
// Scan advances the Tokenizer to the next token, which is available through
// the Token method. It returns false when the tokenization stops, either by
// reaching the end of the input or an error. After Scan returns false, the
// Err method will return any error that occurred during tokenization, except
// that if it was io.EOF, Err will return nil.
func (t *Tokenizer) Scan() bool {
	return t.scanner.Scan()
}
// Token returns the most recent token generated by a call to Scan as a newly
// allocated Token holding the scanned text and its position information.
func (t *Tokenizer) Token() *Token {
	tok := &Token{
		Line:   t.line,
		Column: t.column,
	}
	tok.Text = t.scanner.Text()
	return tok
}
// Err returns the first non-EOF error that was encountered by the Tokenizer.
// It delegates directly to the underlying bufio.Scanner.
func (t *Tokenizer) Err() error {
	return t.scanner.Err()
}
// splitFunc is a wrapper around bufio.SplitFunc that calculates line and
// column information for tokens.
func (t *Tokenizer) splitFunc() bufio.SplitFunc {
line, column := 1, 1
lastLine, lastColumn := -1, -1
return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
// Skip leading non-characters.
start := 0
for width := 0; start < len(data); start += width {
var r rune
r, width = utf8.DecodeRune(data[start:])
if t.tokenizerFunc(r) {
break
}
if r == '\n' {
line++
column = 1
} else {
column++
}
}
if lastLine == -1 {
lastLine, lastColumn = line, column
}
// Scan until first non-character, marking end of token.
for width, i := 0, start; i < len(data); i += width {
var r rune
r, width = utf8.DecodeRune(data[i:])
if r == '\n' {
line++
column = 1
} else {
column++
}
if !t.tokenizerFunc(r) {
t.line, t.column = lastLine, lastColumn
lastLine, lastColumn = -1, -1
return i + width, data[start:i], nil
}
}
// If we're at EOF, we have a final, non-empty, non-terminated token. Return it.
if atEOF && len(data) > start {
t.line, t.column = lastLine, lastColumn
lastLine, lastColumn = -1, -1
return len(data), data[start:], nil
}
// Request more data.
return start, nil, nil
}
}