driver/lexer.go

package driver

import (
	"fmt"
	"io"
	"io/ioutil"
)

type ModeID int

func (id ModeID) Int() int {
	return int(id)
}

type StateID int

func (id StateID) Int() int {
	return int(id)
}

type KindID int

func (id KindID) Int() int {
	return int(id)
}

type ModeKindID int

func (id ModeKindID) Int() int {
	return int(id)
}

type LexSpec interface {
	InitialMode() ModeID
	Pop(mode ModeID, modeKind ModeKindID) bool
	Push(mode ModeID, modeKind ModeKindID) (ModeID, bool)
	ModeName(mode ModeID) string
	InitialState(mode ModeID) StateID
	NextState(mode ModeID, state StateID, v int) (StateID, bool)
	Accept(mode ModeID, state StateID) (ModeKindID, bool)
	KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string)
}

// Token representes a token.
type Token struct {
	// ModeID is an ID of a lex mode.
	ModeID ModeID

	// KindID is an ID of a kind. This is unique among all modes.
	KindID KindID

	// ModeKindID is an ID of a lexical kind. This is unique only within a mode.
	// Note that you need to use KindID field if you want to identify a kind across all modes.
	ModeKindID ModeKindID

	// Row is a row number where a lexeme appears.
	Row int

	// Col is a column number where a lexeme appears.
	// Note that Col is counted in code points, not bytes.
	Col int

	// Lexeme is a byte sequence matched a pattern of a lexical specification.
	Lexeme []byte

	// When this field is true, it means the token is the EOF token.
	EOF bool

	// When this field is true, it means the token is an error token.
	Invalid bool
}

type LexerOption func(l *Lexer) error

// DisableModeTransition disables the active mode transition. Thus, even if the lexical specification has the push and pop
// operations, the lexer doesn't perform these operations. When the lexical specification has multiple modes, and this option is
// enabled, you need to call the Lexer.Push and Lexer.Pop methods to perform the mode transition. You can use the Lexer.Mode method
// to know the current lex mode.
func DisableModeTransition() LexerOption {
	return func(l *Lexer) error {
		l.passiveModeTran = true
		return nil
	}
}

type Lexer struct {
	spec            LexSpec
	src             []byte
	srcPtr          int
	row             int
	col             int
	prevRow         int
	prevCol         int
	tokBuf          []*Token
	modeStack       []ModeID
	passiveModeTran bool
}

// NewLexer returns a new lexer.
func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) {
	b, err := ioutil.ReadAll(src)
	if err != nil {
		return nil, err
	}
	l := &Lexer{
		spec:   spec,
		src:    b,
		srcPtr: 0,
		row:    0,
		col:    0,
		modeStack: []ModeID{
			spec.InitialMode(),
		},
		passiveModeTran: false,
	}
	for _, opt := range opts {
		err := opt(l)
		if err != nil {
			return nil, err
		}
	}

	return l, nil
}

// Next returns a next token.
func (l *Lexer) Next() (*Token, error) {
	if len(l.tokBuf) > 0 {
		tok := l.tokBuf[0]
		l.tokBuf = l.tokBuf[1:]
		return tok, nil
	}

	tok, err := l.nextAndTransition()
	if err != nil {
		return nil, err
	}
	if !tok.Invalid {
		return tok, nil
	}
	errTok := tok
	for {
		tok, err = l.nextAndTransition()
		if err != nil {
			return nil, err
		}
		if !tok.Invalid {
			break
		}
		errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...)
	}
	l.tokBuf = append(l.tokBuf, tok)

	return errTok, nil
}

func (l *Lexer) nextAndTransition() (*Token, error) {
	tok, err := l.next()
	if err != nil {
		return nil, err
	}
	if tok.EOF || tok.Invalid {
		return tok, nil
	}
	if l.passiveModeTran {
		return tok, nil
	}
	mode := l.Mode()
	if l.spec.Pop(mode, tok.ModeKindID) {
		err := l.PopMode()
		if err != nil {
			return nil, err
		}
	}
	if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok {
		l.PushMode(mode)
	}
	// The checking length of the mode stack must be at after pop and push operations because those operations can be performed
	// at the same time. When the mode stack has just one element and popped it, the mode stack will be temporarily emptied.
	// However, since a push operation may be performed immediately after it, the lexer allows the stack to be temporarily empty.
	if len(l.modeStack) == 0 {
		return nil, fmt.Errorf("a mode stack must have at least one element")
	}
	return tok, nil
}

func (l *Lexer) next() (*Token, error) {
	mode := l.Mode()
	state := l.spec.InitialState(mode)
	buf := []byte{}
	unfixedBufLen := 0
	row := l.row
	col := l.col
	var tok *Token
	for {
		v, eof := l.read()
		if eof {
			if tok != nil {
				l.unread(unfixedBufLen)
				return tok, nil
			}
			// When `buf` has unaccepted data and reads the EOF, the lexer treats the buffered data as an invalid token.
			if len(buf) > 0 {
				return &Token{
					ModeID:     mode,
					ModeKindID: 0,
					Lexeme:     buf,
					Row:        row,
					Col:        col,
					Invalid:    true,
				}, nil
			}
			return &Token{
				ModeID:     mode,
				ModeKindID: 0,
				Row:        0,
				Col:        0,
				EOF:        true,
			}, nil
		}
		buf = append(buf, v)
		unfixedBufLen++
		nextState, ok := l.spec.NextState(mode, state, int(v))
		if !ok {
			if tok != nil {
				l.unread(unfixedBufLen)
				return tok, nil
			}
			return &Token{
				ModeID:     mode,
				ModeKindID: 0,
				Lexeme:     buf,
				Row:        row,
				Col:        col,
				Invalid:    true,
			}, nil
		}
		state = nextState
		if modeKindID, ok := l.spec.Accept(mode, state); ok {
			kindID, _ := l.spec.KindIDAndName(mode, modeKindID)
			tok = &Token{
				ModeID:     mode,
				KindID:     kindID,
				ModeKindID: modeKindID,
				Lexeme:     buf,
				Row:        row,
				Col:        col,
			}
			unfixedBufLen = 0
		}
	}
}

// Mode returns the current lex mode.
func (l *Lexer) Mode() ModeID {
	return l.modeStack[len(l.modeStack)-1]
}

// PushMode adds a lex mode onto the mode stack.
func (l *Lexer) PushMode(mode ModeID) {
	l.modeStack = append(l.modeStack, mode)
}

// PopMode removes a lex mode from the top of the mode stack.
func (l *Lexer) PopMode() error {
	sLen := len(l.modeStack)
	if sLen == 0 {
		return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more")
	}
	l.modeStack = l.modeStack[:sLen-1]
	return nil
}

func (l *Lexer) read() (byte, bool) {
	if l.srcPtr >= len(l.src) {
		return 0, true
	}

	b := l.src[l.srcPtr]
	l.srcPtr++

	l.prevRow = l.row
	l.prevCol = l.col

	// Count the token positions.
	// The driver treats LF as the end of lines and counts columns in code points, not bytes.
	// To count in code points, we refer to the First Byte column in the Table 3-6.
	//
	// Reference:
	// - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6.  UTF-8 Bit Distribution
	if b < 128 {
		// 0x0A is LF.
		if b == 0x0A {
			l.row++
			l.col = 0
		} else {
			l.col++
		}
	} else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 {
		l.col++
	}

	return b, false
}

// We must not call this function consecutively to record the token position correctly.
func (l *Lexer) unread(n int) {
	l.srcPtr -= n

	l.row = l.prevRow
	l.col = l.prevCol
}