
Added a new token type that doesn't use regular expressions

macrat committed Jan 10, 2018
1 parent 5af8a1f commit 9d4be71296ab3e683cea584a76bc0629c84472d2
Showing with 257 additions and 72 deletions.
  1. +3 −4 example_test.go
  2. +34 −40 lexer.go
  3. +5 −5 lexer_test.go
  4. +91 −18 token.go
  5. +124 −5 token_test.go
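
The substance of the change is in token.go: TokenType, previously a concrete struct wrapping a *regexp.Regexp, becomes an interface with just GetID() and FindToken(string, Position) *Token, so a token type no longer needs a regular expression at all. As a rough sketch of what the new contract permits (KeywordTokenType is a hypothetical example, not part of this commit):

    package lexerexample

    import (
    	"strings"

    	"github.com/macrat/simplexer"
    )

    // KeywordTokenType is a hypothetical TokenType that matches one
    // fixed keyword without any regular expression.
    type KeywordTokenType struct {
    	ID      simplexer.TokenID
    	Keyword string
    }

    // GetID satisfies the new TokenType interface.
    func (k KeywordTokenType) GetID() simplexer.TokenID { return k.ID }

    // FindToken returns a Token only when s starts with the keyword.
    func (k KeywordTokenType) FindToken(s string, p simplexer.Position) *simplexer.Token {
    	if strings.HasPrefix(s, k.Keyword) {
    		return &simplexer.Token{Type: k, Literal: k.Keyword, Position: p}
    	}
    	return nil
    }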
example_test.go
@@ -2,7 +2,6 @@ package simplexer_test
 
 import (
 	"fmt"
-	"regexp"
 	"strings"
 
 	"github.com/macrat/simplexer"
@@ -90,11 +89,11 @@ func Example_addOriginalTokenType() {
 	input := "hello_world = \"hello world\"\nnumber = 1"
 	lexer := simplexer.NewLexer(strings.NewReader(input))
 
-	lexer.Whitespace = regexp.MustCompile(`^[\t ]`)
+	lexer.Whitespace = simplexer.NewRegexpTokenType(-1, `^[\t ]`)
 
 	lexer.TokenTypes = append([]simplexer.TokenType{
-		simplexer.NewTokenType(SUBSITUATION, `^=`),
-		simplexer.NewTokenType(NEWLINE, `^[\n\r]+`),
+		simplexer.NewRegexpTokenType(SUBSITUATION, `^=`),
+		simplexer.NewRegexpTokenType(NEWLINE, `^[\n\r]+`),
 	}, lexer.TokenTypes...)
 
 	fmt.Println(input)
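
With the PatternTokenType added to token.go in this commit, the fixed = operator above could plausibly be declared without a regular expression; a sketch, not code from this commit (NEWLINE keeps a regexp because it matches a variable-length run):

    lexer.TokenTypes = append([]simplexer.TokenType{
    	simplexer.NewPatternTokenType(SUBSITUATION, []string{"="}),
    	simplexer.NewRegexpTokenType(NEWLINE, `^[\n\r]+`),
    }, lexer.TokenTypes...)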
lexer.go
@@ -2,26 +2,25 @@ package simplexer
 
 import (
 	"io"
-	"regexp"
 	"strings"
 )
 
 // Defined default values for properties of Lexer as a package value.
 var (
-	DefaultWhitespace = regexp.MustCompile(`^(\s|\r|\n)+`)
+	DefaultWhitespace = NewRegexpTokenType(-1, `(?:\s|\r|\n)+`)
 
 	DefaultTokenTypes = []TokenType{
-		NewTokenType(IDENT, `^[a-zA-Z_][a-zA-Z0-9_]*`),
-		NewTokenType(NUMBER, `^[0-9]+(\.[0-9]+)?`),
-		NewTokenType(STRING, `^\"([^"]*)\"`),
-		NewTokenType(OTHER, `^.`),
+		NewRegexpTokenType(IDENT, `[a-zA-Z_][a-zA-Z0-9_]*`),
+		NewRegexpTokenType(NUMBER, `[0-9]+(?:\.[0-9]+)?`),
+		NewRegexpTokenType(STRING, `\"([^"]*)\"`),
+		NewRegexpTokenType(OTHER, `.`),
 	}
 )
 
 /*
 The lexical analyzer.
 
-Whitespace is a regular expression for skipping characters like whitespaces.
+Whitespace is a TokenType for skipping characters like whitespaces.
 The default value is simplexer.DefaultWhitespace.
 
 TokenTypes is an array of TokenType.
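
Note that the rewritten defaults drop their leading ^ anchors and switch incidental groups to non-capturing (?:...): NewRegexpTokenType, added in token.go below, prepends ^ itself when it is missing, so only the meaningful capture (the body of STRING) still reaches Token.Submatches. A small sketch of the anchoring behaviour, assuming the package exports the NUMBER TokenID:

    // Both calls should compile to the same anchored pattern ^[0-9]+.
    a := simplexer.NewRegexpTokenType(simplexer.NUMBER, `[0-9]+`)
    b := simplexer.NewRegexpTokenType(simplexer.NUMBER, `^[0-9]+`)
    fmt.Println(a.Re, b.Re) // ^[0-9]+ ^[0-9]+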
@@ -36,7 +35,7 @@ type Lexer struct {
 	buf        string
 	loadedLine string
 	nextPos    Position
-	Whitespace *regexp.Regexp
+	Whitespace TokenType
 	TokenTypes []TokenType
 }
 
@@ -59,46 +58,45 @@ func (l *Lexer) readBufIfNeed() {
 	}
 }
 
-/*
-Mathing buffer with a regular expression.
-
-Returns submatches.
-*/
-func (l *Lexer) Match(re *regexp.Regexp) []string {
-	l.readBufIfNeed()
-
-	if m := l.Whitespace.FindString(l.buf); m != "" {
-		l.consumeBuffer(m)
+func (l *Lexer) consumeBuffer(t *Token) {
+	if t == nil {
+		return
 	}
 
-	l.readBufIfNeed()
+	l.buf = l.buf[len(t.Literal):]
 
-	return re.FindStringSubmatch(l.buf)
-}
+	l.nextPos = shiftPos(l.nextPos, t.Literal)
 
-func (l *Lexer) consumeBuffer(s string) {
-	l.buf = l.buf[len(s):]
-
-	l.nextPos = shiftPos(l.nextPos, s)
-
-	if idx := strings.LastIndex(s, "\n"); idx >= 0 {
-		l.loadedLine = s[idx+1:]
+	if idx := strings.LastIndex(t.Literal, "\n"); idx >= 0 {
+		l.loadedLine = t.Literal[idx+1:]
 	} else {
-		l.loadedLine += s
+		l.loadedLine += t.Literal
+	}
+}
+
+func (l *Lexer) skipWhitespace() {
+	for true {
+		l.readBufIfNeed()
+
+		if t := l.Whitespace.FindToken(l.buf, l.nextPos); t != nil {
+			l.consumeBuffer(t)
+		} else {
+			break
+		}
 	}
 }
 
 func (l *Lexer) makeError() error {
 	for i, _ := range l.buf {
-		if l.Whitespace.MatchString(l.buf[i:]) {
+		if l.Whitespace.FindToken(l.buf[i:], l.nextPos) != nil {
 			return UnknownTokenError{
 				Literal:  l.buf[:i],
 				Position: l.nextPos,
 			}
 		}
 
 		for _, tokenType := range l.TokenTypes {
-			if tokenType.Re.MatchString(l.buf[i:]) {
+			if tokenType.FindToken(l.buf[i:], l.nextPos) != nil {
 				return UnknownTokenError{
 					Literal:  l.buf[:i],
 					Position: l.nextPos,
@@ -120,13 +118,11 @@ Returns nil as *Token if the buffer is empty.
 */
 func (l *Lexer) Peek() (*Token, error) {
 	for _, tokenType := range l.TokenTypes {
-		if m := l.Match(tokenType.Re); len(m) > 0 {
-			return &Token{
-				Type:       &tokenType,
-				Literal:    m[0],
-				Submatches: m[1:],
-				Position:   l.nextPos,
-			}, nil
+		l.skipWhitespace()
+
+		l.readBufIfNeed()
+		if t := tokenType.FindToken(l.buf, l.nextPos); t != nil {
+			return t, nil
 		}
 	}

@@ -145,9 +141,7 @@ This function using Lexer.Peek. Please read document of Peek.
 func (l *Lexer) Scan() (*Token, error) {
 	t, e := l.Peek()
 
-	if t != nil {
-		l.consumeBuffer(t.Literal)
-	}
+	l.consumeBuffer(t)
 
 	return t, e
 }
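
The scanning flow after this refactoring: Peek skips whitespace via Whitespace.FindToken in a loop, then asks each TokenType for a token at the head of the buffer; Scan consumes whatever Peek found, and the explicit nil check disappears because consumeBuffer now returns early on nil. A typical caller loop is unaffected; roughly (a sketch, relying on Peek's documented behaviour of returning a nil token when the buffer is empty):

    lexer := simplexer.NewLexer(strings.NewReader("hello = 42"))
    for {
    	token, err := lexer.Scan()
    	if err != nil {
    		log.Fatal(err) // e.g. an UnknownTokenError
    	}
    	if token == nil {
    		break // end of input
    	}
    	fmt.Printf("%s: %q\n", token.Type, token.Literal)
    }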
lexer_test.go
@@ -26,8 +26,8 @@ func execute(t *testing.T, input string, wants []want) {
 			t.Fatalf("excepted token type=%s literal=%#v but got nil", except.TypeID, except.Literal)
 		}
 
-		if token.Type.ID != except.TypeID {
-			t.Errorf("excepted type %s but got %s", except.TypeID, token.Type.ID)
+		if token.Type.GetID() != except.TypeID {
+			t.Errorf("excepted type %s but got %s", except.TypeID, token.Type.GetID())
 		}
 		if token.Literal != except.Literal {
 			t.Errorf("excepted literal %#v but got %#v", except.Literal, token.Literal)
@@ -118,7 +118,7 @@ func TestLexer_oneLine(t *testing.T) {
 func TestLexer_reportingError(t *testing.T) {
 	lexer := simplexer.NewLexer(strings.NewReader("1 2 error 3 4"))
 	lexer.TokenTypes = []simplexer.TokenType{
-		simplexer.NewTokenType(0, `^[0-9]+`),
+		simplexer.NewRegexpTokenType(0, `^[0-9]+`),
 	}
 
 	if token, err := lexer.Scan(); err != nil {
if token, err := lexer.Scan(); err != nil {
@@ -160,7 +160,7 @@ func TestLexer_reportingError(t *testing.T) {
 func TestLexer_reportingError_withoutSpace(t *testing.T) {
 	lexer := simplexer.NewLexer(strings.NewReader("1 2 error3 4"))
 	lexer.TokenTypes = []simplexer.TokenType{
-		simplexer.NewTokenType(0, `^[0-9]+`),
+		simplexer.NewRegexpTokenType(0, `^[0-9]+`),
 	}
 
 	if token, err := lexer.Scan(); err != nil {
@@ -202,7 +202,7 @@ func TestLexer_reportingError_withoutSpace(t *testing.T) {
 func TestLexer_reportingError_atLast(t *testing.T) {
 	lexer := simplexer.NewLexer(strings.NewReader("12error"))
 	lexer.TokenTypes = []simplexer.TokenType{
-		simplexer.NewTokenType(0, `^[0-9]+`),
+		simplexer.NewRegexpTokenType(0, `^[0-9]+`),
 	}
 
 	if token, err := lexer.Scan(); err != nil {
token.go
@@ -3,6 +3,7 @@ package simplexer
 import (
 	"regexp"
 	"strconv"
+	"strings"
 )
 
 // TokenID is Identifier for TokenType.
@@ -36,44 +37,116 @@ func (id TokenID) String() string {
 	}
 }
 
-// Compare TokenID as int.
-func (id TokenID) Compare(another TokenID) int {
-	return int(id - another)
+// TokenType is a rule for making Token.
+type TokenType interface {
+	GetID() TokenID
+	FindToken(string, Position) *Token
 }
 
-// TokenType is a rule for making Token.
-type TokenType struct {
+/*
+RegexpTokenType is a TokenType implement with regexp.
+
+ID is TokenID for this token type.
+
+Re is regular expression of token.
+*/
+type RegexpTokenType struct {
 	ID TokenID
-	Re *regexp.Regexp // Regular expression for taking token. Must be starts with ^.
+	Re *regexp.Regexp
 }
 
 /*
-Make new TokenType.
+Make new RegexpTokenType.
 
-token: A TokenID of new TokenType.
+id is a TokenID of new RegexpTokenType.
 
-re: A regular expression of token. Must be starts with ^.
+re is a regular expression of token.
 */
-func NewTokenType(token TokenID, re string) TokenType {
-	return TokenType{
-		ID: token,
+func NewRegexpTokenType(id TokenID, re string) *RegexpTokenType {
+	if !strings.HasPrefix(re, "^") {
+		re = "^" + re
+	}
+	return &RegexpTokenType{
+		ID: id,
 		Re: regexp.MustCompile(re),
 	}
 }
 
 // Get readable string of TokenID.
-func (tt TokenType) String() string {
-	return tt.ID.String()
+func (rtt *RegexpTokenType) String() string {
+	return rtt.ID.String()
 }
 
-// Compare TokenType of ID.
-func (tt TokenType) Compare(another TokenType) int {
-	return tt.ID.Compare(another.ID)
+// GetID returns id of this token type.
+func (rtt *RegexpTokenType) GetID() TokenID {
+	return rtt.ID
+}
+
+// FindToken returns new Token if s starts with this token.
+func (rtt *RegexpTokenType) FindToken(s string, p Position) *Token {
+	m := rtt.Re.FindStringSubmatch(s)
+	if len(m) > 0 {
+		return &Token{
+			Type:       rtt,
+			Literal:    m[0],
+			Submatches: m[1:],
+			Position:   p,
+		}
+	}
+	return nil
+}
+
+/*
+PatternTokenType is dictionary token type.
+
+PatternTokenType has some strings and find token that perfect match they.
+*/
+type PatternTokenType struct {
+	ID       TokenID
+	Patterns []string
+}
+
+/*
+Make new PatternTokenType.
+
+id is a TokenID of new PatternTokenType.
+
+patterns is array of patterns.
+*/
+func NewPatternTokenType(id TokenID, patterns []string) *PatternTokenType {
+	return &PatternTokenType{
+		ID:       id,
+		Patterns: patterns,
+	}
+}
+
+// Get readable string of TokenID.
+func (ptt *PatternTokenType) String() string {
+	return ptt.ID.String()
+}
+
+// GetID returns id of token type.
+func (ptt *PatternTokenType) GetID() TokenID {
+	return ptt.ID
+}
+
+// FindToken returns new Token if s starts with this token.
+func (ptt *PatternTokenType) FindToken(s string, p Position) *Token {
+	for _, x := range ptt.Patterns {
+		if strings.HasPrefix(s, x) {
+			return &Token{
+				Type:     ptt,
+				Literal:  x,
+				Position: p,
+			}
+		}
+	}
+	return nil
 }
 
 // A data of found Token.
 type Token struct {
-	Type       *TokenType
+	Type       TokenType
 	Literal    string   // The string of matched.
 	Submatches []string // Submatches of regular expression.
 	Position   Position // Position of token.
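
Because PatternTokenType.FindToken returns the first pattern that is a prefix of the input, order matters: list longer patterns before any pattern that is their prefix, or the longer one will never match. A usage sketch (the OPERATOR TokenID is illustrative, and a zero Position is assumed to be acceptable here):

    // "==" must precede "=", because the first prefix match wins.
    operators := simplexer.NewPatternTokenType(OPERATOR, []string{"==", "=", "+", "-"})

    if t := operators.FindToken("== 1", simplexer.Position{}); t != nil {
    	fmt.Println(t.Literal) // prints "=="
    }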
