From 476be9c3357ce4b0a1bce1b6dfe9f1f46d890ba8 Mon Sep 17 00:00:00 2001
From: mewmew <rnd0x00@gmail.com>
Date: Mon, 13 Feb 2017 16:03:02 +0100
Subject: [PATCH] examples/uc: Add uc example to show the interaction between
 terms and genlex.

---
 examples/uc/Makefile         |  15 ++
 examples/uc/lexer/lexer.go   | 135 +++++++++++++++
 examples/uc/testdata/input.c |  75 +++++++++
 examples/uc/token/token.go   |  68 ++++++++
 examples/uc/uc.ebnf          | 316 +++++++++++++++++++++++++++++++++++
 examples/uc/uc.json          | 132 +++++++++++++++
 6 files changed, 741 insertions(+)
 create mode 100644 examples/uc/Makefile
 create mode 100644 examples/uc/lexer/lexer.go
 create mode 100644 examples/uc/testdata/input.c
 create mode 100644 examples/uc/token/token.go
 create mode 100644 examples/uc/uc.ebnf
 create mode 100644 examples/uc/uc.json

diff --git a/examples/uc/Makefile b/examples/uc/Makefile
new file mode 100644
index 0000000..cad02fa
--- /dev/null
+++ b/examples/uc/Makefile
@@ -0,0 +1,15 @@
+all: lexer
+
+# uc.ebnf -> uc.json
+%.json: %.ebnf
+	terms -indent -start File -skip "whitespace,comment" -o $@ $<
+
+# uc.json -> lexer and token packages
+lexer: uc.json
+	genlex $<
+
+clean:
+	rm -rf token lexer
+	rm -f uc.json
+
+.PHONY: all clean
diff --git a/examples/uc/lexer/lexer.go b/examples/uc/lexer/lexer.go
new file mode 100644
index 0000000..43a7944
--- /dev/null
+++ b/examples/uc/lexer/lexer.go
@@ -0,0 +1,135 @@
+// generated by speak; DO NOT EDIT.
+
+// Package lexer implements lexical analysis of the source language.
+package lexer
+
+import (
+	"io"
+	"io/ioutil"
+	"regexp"
+
+	"github.com/mewmew/speak/examples/uc/token"
+	"github.com/pkg/errors"
+)
+
+// regstr specifies a regular expression for identifying the tokens of the input
+// grammar.
+const regstr = `^(('(?:\\n|a)')|([A-Z_a-z][0-9A-Z_a-z]*)|([0-9][0-9]*)|(!)|(!=)|(&&)|(\()|(\))|(\*)|(\+)|(,)|(-)|(/)|(;)|(<)|(<=)|(=)|(==)|(>)|(>=)|(\[)|(\])|(else)|(if)|(return)|(typedef)|(while)|(\{)|(\})|(//(?-s:.)*\n|#(?-s:.)*\n|/\*[^\*]*\*/)|([\t-\r ]))`
+
+// reg is a compiled version of regstr with leftmost-longest matching enabled.
+var reg *regexp.Regexp
+
+func init() {
+	// Compile regexp for identifying tokens and enforce leftmost-longest
+	// matching.
+	reg = regexp.MustCompile(regstr)
+	reg.Longest()
+}
+
+// A Lexer lexes the source input into a slice of tokens.
+type Lexer struct {
+	// Source input.
+	input []byte
+	// Current position in the source input.
+	pos int
+}
+
+// New returns a new scanner lexing from r.
+func New(r io.Reader) (*Lexer, error) {
+	input, err := ioutil.ReadAll(r)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+	return NewFromBytes(input), nil
+}
+
+// Open returns a new scanner lexing from path.
+func Open(path string) (*Lexer, error) {
+	input, err := ioutil.ReadFile(path)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+	return NewFromBytes(input), nil
+}
+
+// NewFromString returns a new scanner lexing from input.
+func NewFromString(input string) *Lexer {
+	return NewFromBytes([]byte(input))
+}
+
+// NewFromBytes returns a new scanner lexing from input.
+func NewFromBytes(input []byte) *Lexer {
+	return &Lexer{input: input}
+}
+
+// Scan lexes and returns the next token of the source input.
+func (l *Lexer) Scan() (*token.Token, error) {
+	// Handle EOF.
+	if l.pos >= len(l.input) {
+		return nil, errors.WithStack(io.EOF)
+	}
+	input := l.input[l.pos:]
+	// Identify token locations matching start of input.
+	loc, err := tokenLocs(input)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+	n, id, err := locateTokens(input, loc)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+	lit := input[:n]
+	tok := &token.Token{
+		Pos: l.pos,
+		ID:  id,
+		Lit: lit,
+	}
+	l.pos += n
+	return tok, nil
+}
+
+// locateTokens searches for the longest token that matches the start of the
+// input.
+func locateTokens(input []byte, loc []int) (n int, id token.ID, err error) {
+	n = -1
+	for i := 0; i < len(token.IDs); i++ {
+		start := loc[2*i]
+		if start == -1 {
+			continue
+		}
+		if start != 0 {
+			return 0, 0, errors.Errorf("invalid start index of token; expected 0, got %d", start)
+		}
+		end := loc[2*i+1]
+		if n != -1 {
+			return 0, 0, errors.Errorf("ambiguity detected; input matches both token %q and token %q", input[:n], input[:end])
+		}
+		n = end
+		id = token.ID(i)
+	}
+	if n == -1 {
+		// no matching token located.
+		return 0, 0, errors.Errorf("unable to identify valid token at %q", input)
+	}
+	return n, id, nil
+}
+
+// tokenLocs returns the start and end location of each token type that matches
+// the start of the input.
+func tokenLocs(input []byte) ([]int, error) {
+	loc := reg.FindSubmatchIndex(input)
+	if loc == nil {
+		// no submatch located.
+		return nil, errors.Errorf("unable to identify valid token at %q", input)
+	}
+	// Validate submatch indices length; expecting two indices - start and end -
+	// per submatch, and in total 2 + (number of tokens) submatches.
+	got := len(loc)
+	want := 2 * (2 + len(token.IDs))
+	if got != want {
+		return nil, errors.Errorf("invalid number of submatches; expected %d, got %d", want, got)
+	}
+	// Skip the first two submatches as they do not identify specific tokens.
+	loc = loc[2*2:]
+	return loc, nil
+}
diff --git a/examples/uc/testdata/input.c b/examples/uc/testdata/input.c
new file mode 100644
index 0000000..c1e4d93
--- /dev/null
+++ b/examples/uc/testdata/input.c
@@ -0,0 +1,75 @@
+// This program illustrates the quick sort algorithm by sorting an
+// array of char and printing the intermediate results.
+//
+// Adapted from N.Wirth: Algorithms + Data Structures = Programs
+
+
+void putstring(char s[]);
+
+char eol[2];
+int n;
+
+
+void sort(char a[], int l, int r) {
+  int i;
+  int j;
+  char x;
+  char w;
+
+
+  i = l;
+  j = r;
+  x = a[(l+r) / 2];
+  
+  while ( i<= j) {
+    while (a[i] < x) i = i + 1;
+    while (x < a[j]) j = j - 1;
+    if (i<= j) {
+      w = a[i];
+      a[i] = a[j];
+      a[j] = w;
+      i = i + 1;
+      j = j - 1;
+    }
+  }
+
+  putstring (a);
+  putstring (eol);
+  if (l < j) sort(a, l,j);
+  if (i < r) sort(a, i, r);
+
+}
+
+int main(void)
+{
+  char s[27];
+  int i;
+  char t;
+  int q;
+
+  eol[0] = '\n';
+  eol[1] = 0;
+
+  n = 26;
+
+  s[n] = 0;
+
+  i = 0;
+
+  // Fill the string with random-looking data
+  q = 11;
+  while (i<n) {
+    t = q - (q / 26)*26;
+    s[i] = 'a'+t;
+    i = i + 1;
+    q = q + 17;
+  }
+
+
+  putstring (s); // print it ...
+  putstring (eol);
+  sort(s, 0, n-1); // sort it ...
+  putstring(s);  // and print again
+  putstring (eol);
+
+}
diff --git a/examples/uc/token/token.go b/examples/uc/token/token.go
new file mode 100644
index 0000000..49ecef6
--- /dev/null
+++ b/examples/uc/token/token.go
@@ -0,0 +1,68 @@
+// generated by speak; DO NOT EDIT.
+
+// Package token defines constants representing the lexical tokens of the source
+// language.
+package token
+
+import "fmt"
+
+// A Token represents a lexical token of the source language.
+type Token struct {
+	// Start position in the source input.
+	Pos int
+	// Token type.
+	ID ID
+	// Token literal.
+	Lit []byte
+}
+
+// String returns the string representation of the token.
+func (tok *Token) String() string {
+	return fmt.Sprintf("Pos: %d, ID: %s, Lit: %q", tok.Pos, tok.ID, tok.Lit)
+}
+
+// ID is the set of lexical tokens of the source language.
+type ID uint
+
+// String returns the string representation of the token ID.
+func (id ID) String() string {
+	if int(id) < len(IDs) {
+		return IDs[id]
+	}
+	return fmt.Sprintf("<unknown token ID %d>", uint(id))
+}
+
+// IDs specifies the string representation of each token ID.
+var IDs = [...]string{
+	"name(0, `char_lit`)",
+	"name(1, `ident`)",
+	"name(2, `int_lit`)",
+	"token(3, `!`)",
+	"token(4, `!=`)",
+	"token(5, `&&`)",
+	"token(6, `(`)",
+	"token(7, `)`)",
+	"token(8, `*`)",
+	"token(9, `+`)",
+	"token(10, `,`)",
+	"token(11, `-`)",
+	"token(12, `/`)",
+	"token(13, `;`)",
+	"token(14, `<`)",
+	"token(15, `<=`)",
+	"token(16, `=`)",
+	"token(17, `==`)",
+	"token(18, `>`)",
+	"token(19, `>=`)",
+	"token(20, `[`)",
+	"token(21, `]`)",
+	"token(22, `else`)",
+	"token(23, `if`)",
+	"token(24, `return`)",
+	"token(25, `typedef`)",
+	"token(26, `while`)",
+	"token(27, `{`)",
+	"token(28, `}`)",
+	"skip(29, `comment`)",
+	"skip(30, `whitespace`)",
+}
diff --git a/examples/uc/uc.ebnf b/examples/uc/uc.ebnf
new file mode 100644
index 0000000..7c5db59
--- /dev/null
+++ b/examples/uc/uc.ebnf
@@ -0,0 +1,316 @@
+// A grammar for the µC programming language [1].
+//
+// The content and structure of this document are heavily influenced by the Go
+// Programming Language Specification [2] and some parts are therefore governed
+// by a BSD-style license [3]. Any original content of this document is hereby
+// released into the public domain [4].
+//
+// References:
+//    [1]: https://www.it.uu.se/katalog/aleji304/CompilersProject/uc.html
+//    [2]: http://golang.org/ref/spec
+//    [3]: http://golang.org/LICENSE
+//    [4]: https://creativecommons.org/publicdomain/zero/1.0/
+
+// # Source code representation
+//
+
+// ## Characters
+//
+
+// An arbitrary ASCII character except null (0x00), new lines (0x0A), carriage
+// return (0x0D), apostrophe (0x27), double quote (0x22) and backslash (0x5C).
+//_ascii_char
+//	= "\x01" … "\x09"
+//	| "\x0B" … "\x0C"
+//	| "\x0E" … "\x21"
+//	| "\x23" … "\x26"
+//	| "\x28" … "\x5B"
+//	| "\x5D" … "\x7F"
+//.
+
+_ascii_letter = "a" … "z" | "A" … "Z" .
+_ascii_digit  = "0" … "9" .
+
+// ## Letters and digits
+//
+
+_letter        = _ascii_letter | "_" .
+_decimal_digit = _ascii_digit .
+_decimals      = _decimal_digit { _decimal_digit } .
+
+// # Lexical elements
+//
+
+// ## Comments
+//
+
+_not_newline = "\x00" … "\x09" | "\x0B" … "\U0010FFFF" .
+_not_star = "\x00" … "\x29" | "\x2B" … "\U0010FFFF" .
+
+_line_comment
+	= "/" "/" { _not_newline } "\n"
+	// TODO: Implement proper support for preprocessor directives.
+	| "#"  { _not_newline } "\n"
+.
+_block_comment = "/" "*" { _not_star } "*" "/" .
+comment        = _line_comment | _block_comment .
+
+// ## Tokens
+//
+
+// White space, formed from spaces (0x20), horizontal tabs (0x09), new line
+// (line-feed (0x0A) or carriage-return (0x0D)), vertical tabs (0x0B), and form-
+// feeds (0x0C) (§6.4), is ignored except as it separates tokens that would
+// otherwise combine into a single token.
+whitespace = " " | "\t" | "\v" | "\f" | "\r" | "\n" .
+
+// ## Identifiers
+//
+
+ident = _letter { _letter | _decimal_digit } .
+
+// ## Integer literals
+//
+
+int_lit = _decimals .
+
+// ## Character literals
+//
+
+_escaped_char = "\\" "n" .
+//char_lit      = "'" ( _ascii_char | "\"" | _escaped_char ) "'" .
+char_lit = "'" ( _escaped_char | "a" ) "'" .
+//_char = _escaped_char | "a" .
+//char_lit = "'" _char "'" .
+
+// # Syntactic production rules
+//
+
+File
+	= Decls
+.
+
+Decls = [ DeclList ] .
+
+DeclList
+	= Decl
+	| DeclList Decl
+.
+
+Decl
+	= VarDecl ";"
+	| FuncDecl ";"
+	| FuncDef
+	| TypeDef ";"
+.
+
+FuncDecl
+	= FuncHeader
+.
+
+FuncHeader
+	// BasicType : "char" | "int" | "void" ;
+	= BasicType ident "(" Params ")"
+.
+
+FuncDef
+	= FuncHeader BlockStmt
+.
+
+VarDecl
+	= ScalarDecl
+	| ArrayDecl
+.
+
+ScalarDecl
+	// BasicType : "char" | "int" ;
+	= BasicType ident
+.
+
+ArrayDecl
+	// BasicType : "char" | "int" ;
+	= BasicType ident "[" IntLit "]"
+	| BasicType ident "[" "]"
+.
+
+IntLit
+	= int_lit
+	| char_lit
+.
+
+TypeDef
+	= "typedef" Type ident
+.
+
+BasicType
+	// BasicType : "char" | "int" | "void" ;
+	= ident
+.
+
+Params = [ ParamList ] .
+
+ParamList
+	= Param
+	| ParamList "," Param
+.
+
+Param
+	// BasicType : "void" ;
+	= Type
+	| VarDecl
+.
+
+// TODO: Add support for array types.
+Type
+	= BasicType
+.
+
+Stmt
+	= MatchedStmt
+	| OpenStmt
+.
+
+// Thanks to http://www.parsifalsoft.com/ifelse.html for loop statement
+// resolving (while, do, for).
+
+OtherStmt
+	= Expr ";"
+	| "return" Expr ";"
+	| "return" ";"
+	| BlockStmt
+	| ";"
+.
+
+BlockStmt
+	= "{" BlockItems "}"
+.
+
+MatchedStmt
+	= "if" Condition MatchedStmt
+	  "else" MatchedStmt
+	| "while" Condition MatchedStmt
+	| OtherStmt
+.
+
+OpenStmt
+	= "if" Condition Stmt
+	| "if" Condition MatchedStmt
+	  "else" OpenStmt
+	| "while" Condition OpenStmt
+.
+
+Condition
+	= "(" Expr ")"
+.
+
+BlockItems = [ BlockItemList ] .
+
+BlockItemList
+	= BlockItem
+	| BlockItemList BlockItem
+.
+
+BlockItem
+	= Decl
+	| Stmt
+.
+
+Expr
+	= Expr2R
+.
+
+// Right-associative binary expressions with precedence 2.
+//
+//    2R: =
+Expr2R
+	= Expr5L
+	// Right-associative.
+	| Expr5L "=" Expr2R
+.
+
+// Left-associative binary expressions with precedence 5.
+//
+//    5L: &&
+Expr5L
+	= Expr9L
+	| Expr5L "&&" Expr9L
+.
+
+// Left-associative binary expressions with precedence 9.
+//
+//    9L: == !=
+Expr9L
+	= Expr10L
+	| Expr9L "==" Expr10L
+	| Expr9L "!=" Expr10L
+.
+
+// Left-associative binary expressions with precedence 10.
+//
+//    10L: < > <= >=
+Expr10L
+	= Expr12L
+	| Expr10L "<" Expr12L
+	| Expr10L ">" Expr12L
+	| Expr10L "<=" Expr12L
+	| Expr10L ">=" Expr12L
+.
+
+// Left-associative binary expressions with precedence 12.
+//
+//    12L: + -
+Expr12L
+	= Expr13L
+	| Expr12L "+" Expr13L
+	| Expr12L "-" Expr13L
+.
+
+// Left-associative binary expressions with precedence 13.
+//
+//    13L: * /
+Expr13L
+	= Expr14
+	| Expr13L "*" Expr14
+	| Expr13L "/" Expr14
+.
+
+// Unary expressions with precedence 14.
+//
+//    14: - !
+Expr14
+	= Expr15
+	| "-" Expr14
+	| "!" Expr14
+.
+
+// TODO: Replace function name with expression in call expression. Do the same
+// for array names.
+
+// TODO: Replace Expr15 (and similar names) with CastExpr, PostfixExpr, ...
+// (from the C11 spec).
+
+// Expressions with precedence 15.
+Expr15
+	= PrimaryExpr
+	| ident "[" Expr "]"
+	| ident "(" Args ")"
+.
+
+// Primary expressions with the highest precedence (§A.2.1).
+PrimaryExpr
+	= int_lit
+	| char_lit
+	| ident
+	| ParenExpr
+.
+
+ParenExpr
+	= "(" Expr ")"
+.
+
+Args = [ ExprList ] .
+
+ExprList
+	= Expr
+	| ExprList "," Expr
+.
diff --git a/examples/uc/uc.json b/examples/uc/uc.json
new file mode 100644
index 0000000..473d414
--- /dev/null
+++ b/examples/uc/uc.json
@@ -0,0 +1,132 @@
+{
+	"names": [
+		{
+			"id": "char_lit",
+			"reg": "'(?:\\\\n|a)'"
+		},
+		{
+			"id": "ident",
+			"reg": "[A-Z_a-z][0-9A-Z_a-z]*"
+		},
+		{
+			"id": "int_lit",
+			"reg": "[0-9][0-9]*"
+		}
+	],
+	"tokens": [
+		{
+			"id": "!",
+			"reg": "!"
+		},
+		{
+			"id": "!=",
+			"reg": "!="
+		},
+		{
+			"id": "\u0026\u0026",
+			"reg": "\u0026\u0026"
+		},
+		{
+			"id": "(",
+			"reg": "\\("
+		},
+		{
+			"id": ")",
+			"reg": "\\)"
+		},
+		{
+			"id": "*",
+			"reg": "\\*"
+		},
+		{
+			"id": "+",
+			"reg": "\\+"
+		},
+		{
+			"id": ",",
+			"reg": ","
+		},
+		{
+			"id": "-",
+			"reg": "-"
+		},
+		{
+			"id": "/",
+			"reg": "/"
+		},
+		{
+			"id": ";",
+			"reg": ";"
+		},
+		{
+			"id": "\u003c",
+			"reg": "\u003c"
+		},
+		{
+			"id": "\u003c=",
+			"reg": "\u003c="
+		},
+		{
+			"id": "=",
+			"reg": "="
+		},
+		{
+			"id": "==",
+			"reg": "=="
+		},
+		{
+			"id": "\u003e",
+			"reg": "\u003e"
+		},
+		{
+			"id": "\u003e=",
+			"reg": "\u003e="
+		},
+		{
+			"id": "[",
+			"reg": "\\["
+		},
+		{
+			"id": "]",
+			"reg": "\\]"
+		},
+		{
+			"id": "else",
+			"reg": "else"
+		},
+		{
+			"id": "if",
+			"reg": "if"
+		},
+		{
+			"id": "return",
+			"reg": "return"
+		},
+		{
+			"id": "typedef",
+			"reg": "typedef"
+		},
+		{
+			"id": "while",
+			"reg": "while"
+		},
+		{
+			"id": "{",
+			"reg": "\\{"
+		},
+		{
+			"id": "}",
+			"reg": "\\}"
+		}
+	],
+	"skip": [
+		{
+			"id": "comment",
+			"reg": "//(?-s:.)*\\n|#(?-s:.)*\\n|/\\*[^\\*]*\\*/"
+		},
+		{
+			"id": "whitespace",
+			"reg": "[\\t-\\r ]"
+		}
+	]
+}