examples/uc: Add uc example to show the interaction between terms and genlex.
mewmew committed Feb 13, 2017
1 parent 41a3902 commit 476be9c
Showing 6 changed files with 741 additions and 0 deletions.
15 changes: 15 additions & 0 deletions examples/uc/Makefile
@@ -0,0 +1,15 @@
all: lexer

# uc.ebnf -> uc.json
%.json: %.ebnf
terms -indent -start File -skip "whitespace,comment" -o $@ $<

# uc.json -> lexer and token packages
lexer: uc.json
genlex $<

clean:
rm -rf token lexer
rm -f uc.json

.PHONY: all clean
135 changes: 135 additions & 0 deletions examples/uc/lexer/lexer.go
@@ -0,0 +1,135 @@
// generated by speak; DO NOT EDIT.

// Package lexer implements lexical analysis of the source language.
package lexer

import (
"io"
"io/ioutil"
"regexp"

"github.com/mewmew/speak/examples/uc/token"
"github.com/pkg/errors"
)

// regstr specifies a regular expression for identifying the tokens of the input
// grammar.
const regstr = `^(('(?:\\n|a)')|([A-Z_a-z][0-9A-Z_a-z]*)|([0-9][0-9]*)|(!)|(!=)|(&&)|(\()|(\))|(\*)|(\+)|(,)|(-)|(/)|(;)|(<)|(<=)|(=)|(==)|(>)|(>=)|(\[)|(\])|(else)|(if)|(return)|(typedef)|(while)|(\{)|(\})|(//(?-s:.)*\n|#(?-s:.)*\n|/\*[^\*]*\*/)|([\t-\r ]))`

// reg is a compiled version of regstr with leftmost-longest matching enabled.
var reg *regexp.Regexp

func init() {
// Compile regexp for identifying tokens and enforce leftmost-longest
// matching.
reg = regexp.MustCompile(regstr)
reg.Longest()
}

// A Lexer lexes the source input into tokens.
type Lexer struct {
// Source input.
input []byte
// Current position in the source input.
pos int
}

// New returns a new scanner lexing from r.
func New(r io.Reader) (*Lexer, error) {
input, err := ioutil.ReadAll(r)
if err != nil {
return nil, errors.WithStack(err)
}
return NewFromBytes(input), nil
}

// Open returns a new scanner lexing from path.
func Open(path string) (*Lexer, error) {
input, err := ioutil.ReadFile(path)
if err != nil {
return nil, errors.WithStack(err)
}
return NewFromBytes(input), nil
}

// NewFromString returns a new scanner lexing from input.
func NewFromString(input string) *Lexer {
return NewFromBytes([]byte(input))
}

// NewFromBytes returns a new scanner lexing from input.
func NewFromBytes(input []byte) *Lexer {
return &Lexer{input: input}
}

// Scan lexes and returns the next token of the source input.
func (l *Lexer) Scan() (*token.Token, error) {
// Handle EOF.
if l.pos >= len(l.input) {
return nil, errors.WithStack(io.EOF)
}
input := l.input[l.pos:]
// Identify token locations matching start of input.
loc, err := tokenLocs(input)
if err != nil {
return nil, errors.WithStack(err)
}
n, id, err := locateTokens(input, loc)
if err != nil {
return nil, errors.WithStack(err)
}
lit := input[:n]
tok := &token.Token{
Pos: l.pos,
ID: id,
Lit: lit,
}
l.pos += n
return tok, nil
}

// locateTokens searches for the longest token that matches the start of the
// input.
func locateTokens(input []byte, loc []int) (n int, id token.ID, err error) {
n = -1
for i := 0; i < len(token.IDs); i++ {
start := loc[2*i]
if start == -1 {
continue
}
if start != 0 {
return 0, 0, errors.Errorf("invalid start index of token; expected 0, got %d", start)
}
end := loc[2*i+1]
if n != -1 {
return 0, 0, errors.Errorf("ambiguity detected; input matches both token %q and token %q", input[:n], input[:end])
}
n = end
id = token.ID(i)
}
if n == -1 {
// no matching token located.
return 0, 0, errors.Errorf("unable to identify valid token at %q", input)
}
return n, id, nil
}

// tokenLocs returns the start and end location of each token type that matches
// the start of the input.
func tokenLocs(input []byte) ([]int, error) {
loc := reg.FindSubmatchIndex(input)
if loc == nil {
// no submatch located.
return nil, errors.Errorf("unable to identify valid token at %q", input)
}
// Validate submatch indices length; expecting two indices - start and end -
// per submatch, and in total 2 + (number of tokens) submatches.
got := len(loc)
want := 2 * (2 + len(token.IDs))
if got != want {
return nil, errors.Errorf("invalid number of submatches; expected %d, got %d", want, got)
}
// Skip the first two submatches as they do not identify specific tokens.
loc = loc[2*2:]
return loc, nil
}
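
The generated API is small enough to exercise by hand. The sketch below is a hypothetical driver, not part of this commit; it assumes it is run from the examples/uc directory, opens testdata/input.c with the generated lexer, and prints every token until Scan reports io.EOF.

// Hypothetical driver; not generated by speak and not part of this commit.
package main

import (
	"fmt"
	"io"
	"log"

	"github.com/mewmew/speak/examples/uc/lexer"
	"github.com/pkg/errors"
)

func main() {
	// Open the sample input shipped with the example (assumes the working
	// directory is examples/uc).
	l, err := lexer.Open("testdata/input.c")
	if err != nil {
		log.Fatalf("%+v", err)
	}
	for {
		tok, err := l.Scan()
		if err != nil {
			// Scan wraps io.EOF with a stack trace, so unwrap before comparing.
			if errors.Cause(err) == io.EOF {
				break
			}
			log.Fatalf("%+v", err)
		}
		// Prints position, token ID and literal via Token.String.
		fmt.Println(tok)
	}
}

Note that whitespace and comments are returned as ordinary tokens with skip IDs; filtering them out is left to the caller.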
75 changes: 75 additions & 0 deletions examples/uc/testdata/input.c
@@ -0,0 +1,75 @@
// This program illustrates the quick sort algorithm by sorting an
// array of char and printing the intermediate results.
//
// Adapted from N.Wirth: Algorithms + Data Structures = Programs


void putstring(char s[]);

char eol[2];
int n;


void sort(char a[], int l, int r) {
int i;
int j;
char x;
char w;


i = l;
j = r;
x = a[(l+r) / 2];

while ( i<= j) {
while (a[i] < x) i = i + 1;
while (x < a[j]) j = j - 1;
if (i<= j) {
w = a[i];
a[i] = a[j];
a[j] = w;
i = i + 1;
j = j - 1;
}
}

putstring (a);
putstring (eol);
if (l < j) sort(a, l,j);
if (i < r) sort(a, i, r);

}

int main(void)
{
char s[27];
int i;
char t;
int q;

eol[0] = '\n';
eol[1] = 0;

n = 26;

s[n] = 0;

i = 0;

// Fill the string with random-looking data
q = 11;
while (i<n) {
t = q - (q / 26)*26;
s[i] = 'a'+t;
i = i + 1;
q = q + 17;
}


putstring (s); // print it ...
putstring (eol);
sort(s, 0, n-1); // sort it ...
putstring(s); // and print again
putstring (eol);

}
68 changes: 68 additions & 0 deletions examples/uc/token/token.go
@@ -0,0 +1,68 @@
// generated by speak; DO NOT EDIT.

// Package token defines constants representing the lexical tokens of the source
// language.
package token

import "fmt"

// A Token represents a lexical token of the source language.
type Token struct {
// Start position in the source input.
Pos int
// Token type.
ID ID
// Token literal.
Lit []byte
}

// String returns the string representation of the token.
func (tok *Token) String() string {
return fmt.Sprintf("Pos: %d, ID: %s, Lit: %q", tok.Pos, tok.ID, tok.Lit)
}

// ID is the set of lexical tokens of the source language.
type ID uint

// String returns the string representation of the token ID.
func (id ID) String() string {
if int(id) < len(IDs) {
return IDs[id]
}
return fmt.Sprintf("<unknown token ID %d>", uint(id))
}

// IDs specifies the string representation of each token ID.
var IDs = [...]string{
"name(0, `char_lit`)",
"name(1, `ident`)",
"name(2, `int_lit`)",
"token(3, `!`)",
"token(4, `!=`)",
"token(5, `&&`)",
"token(6, `(`)",
"token(7, `)`)",
"token(8, `*`)",
"token(9, `+`)",
"token(10, `,`)",
"token(11, `-`)",
"token(12, `/`)",
"token(13, `;`)",
"token(14, `<`)",
"token(15, `<=`)",
"token(16, `=`)",
"token(17, `==`)",
"token(18, `>`)",
"token(19, `>=`)",
"token(20, `[`)",
"token(21, `]`)",
"token(22, `else`)",
"token(23, `if`)",
"token(24, `return`)",
"token(25, `typedef`)",
"token(26, `while`)",
"token(27, `{`)",
"token(28, `}`)",
"skip(29, `comment`)",
"skip(30, `whitespace`)",
}
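
For reference, the two String methods compose as follows; this is a small, hypothetical illustration (not part of the commit) whose expected output is read directly off the IDs table and the Token.String format above.

// Hypothetical illustration of the token package's String methods; not part
// of this commit.
package main

import (
	"fmt"

	"github.com/mewmew/speak/examples/uc/token"
)

func main() {
	// ID.String indexes into the IDs table.
	fmt.Println(token.ID(23)) // token(23, `if`)

	// Token.String combines position, ID and literal.
	tok := &token.Token{Pos: 42, ID: token.ID(1), Lit: []byte("main")}
	fmt.Println(tok) // Pos: 42, ID: name(1, `ident`), Lit: "main"
}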
