From 476be9c3357ce4b0a1bce1b6dfe9f1f46d890ba8 Mon Sep 17 00:00:00 2001 From: mewmew Date: Mon, 13 Feb 2017 16:03:02 +0100 Subject: [PATCH] examples/uc: Add uc example to show the interaction between terms and genlex. --- examples/uc/Makefile | 15 ++ examples/uc/lexer/lexer.go | 135 +++++++++++++++ examples/uc/testdata/input.c | 75 +++++++++ examples/uc/token/token.go | 68 ++++++++ examples/uc/uc.ebnf | 316 +++++++++++++++++++++++++++++++++++ examples/uc/uc.json | 132 +++++++++++++++ 6 files changed, 741 insertions(+) create mode 100644 examples/uc/Makefile create mode 100644 examples/uc/lexer/lexer.go create mode 100644 examples/uc/testdata/input.c create mode 100644 examples/uc/token/token.go create mode 100644 examples/uc/uc.ebnf create mode 100644 examples/uc/uc.json diff --git a/examples/uc/Makefile b/examples/uc/Makefile new file mode 100644 index 0000000..cad02fa --- /dev/null +++ b/examples/uc/Makefile @@ -0,0 +1,15 @@ +all: lexer + +# uc.ebnf -> uc.json +%.json: %.ebnf + terms -indent -start File -skip "whitespace,comment" -o $@ $< + +# uc.json -> lexer and token packages +lexer: uc.json + genlex $< + +clean: + rm -rf token lexer + rm -f uc.json + +.PHONY: all clean diff --git a/examples/uc/lexer/lexer.go b/examples/uc/lexer/lexer.go new file mode 100644 index 0000000..43a7944 --- /dev/null +++ b/examples/uc/lexer/lexer.go @@ -0,0 +1,135 @@ +// generated by speak; DO NOT EDIT. + +// Package lexer implements lexical analysis of the source language. +package lexer + +import ( + "io" + "io/ioutil" + "regexp" + + "github.com/mewmew/speak/examples/uc/token" + "github.com/pkg/errors" +) + +// regstr specifies a regular expression for identifying the tokens of the input +// grammar. 
+const regstr = `^(('(?:\\n|a)')|([A-Z_a-z][0-9A-Z_a-z]*)|([0-9][0-9]*)|(!)|(!=)|(&&)|(\()|(\))|(\*)|(\+)|(,)|(-)|(/)|(;)|(<)|(<=)|(=)|(==)|(>)|(>=)|(\[)|(\])|(else)|(if)|(return)|(typedef)|(while)|(\{)|(\})|(//(?-s:.)*\n|#(?-s:.)*\n|/\*[^\*]*\*/)|([\t-\r ]))` + +// reg is a compiled version of regstr with leftmost-longest matching enabled. +var reg *regexp.Regexp + +func init() { + // Compile regexp for identifying tokens and enforce leftmost-longest + // matching. + reg = regexp.MustCompile(regstr) + reg.Longest() +} + +// A Lexer lexes the source input into a slice of tokens. +type Lexer struct { + // Source input. + input []byte + // Current position in the source input. + pos int +} + +// New returns a new scanner lexing from r. +func New(r io.Reader) (*Lexer, error) { + input, err := ioutil.ReadAll(r) + if err != nil { + return nil, errors.WithStack(err) + } + return NewFromBytes(input), nil +} + +// Open returns a new scanner lexing from path. +func Open(path string) (*Lexer, error) { + input, err := ioutil.ReadFile(path) + if err != nil { + return nil, errors.WithStack(err) + } + return NewFromBytes(input), nil +} + +// NewFromString returns a new scanner lexing from input. +func NewFromString(input string) *Lexer { + return NewFromBytes([]byte(input)) +} + +// NewFromBytes returns a new scanner lexing from input. +func NewFromBytes(input []byte) *Lexer { + return &Lexer{input: input} +} + +// Scan lexes and returns the next token of the source input. +func (l *Lexer) Scan() (*token.Token, error) { + // Handle EOF. + if l.pos >= len(l.input) { + return nil, errors.WithStack(io.EOF) + } + input := l.input[l.pos:] + // Identify token locations matching start of input. 
+ loc, err := tokenLocs(input) + if err != nil { + return nil, errors.WithStack(err) + } + n, id, err := locateTokens(input, loc) + if err != nil { + return nil, errors.WithStack(err) + } + lit := input[:n] + tok := &token.Token{ + Pos: l.pos, + ID: id, + Lit: lit, + } + l.pos += n + return tok, nil +} + +// locateTokens searches for the longest token that matches the start of the +// input. +func locateTokens(input []byte, loc []int) (n int, id token.ID, err error) { + n = -1 + for i := 0; i < len(token.IDs); i++ { + start := loc[2*i] + if start == -1 { + continue + } + if start != 0 { + return 0, 0, errors.Errorf("invalid start index of token; expected 0, got %d", start) + } + end := loc[2*i+1] + if n != -1 { + return 0, 0, errors.Errorf("ambiguity detected; input matches both token %q and token %q", input[:n], input[:end]) + } + n = end + id = token.ID(i) + } + if n == -1 { + // no matching token located. + return 0, 0, errors.Errorf("unable to identify valid token at %q", input) + } + return n, id, nil +} + +// tokenLocs returns start and end location of each token type that matches the +// start of the input. +func tokenLocs(input []byte) ([]int, error) { + loc := reg.FindSubmatchIndex(input) + if loc == nil { + // no submatch located. + return nil, errors.Errorf("unable to identify valid token at %q", input) + } + // Validate submatch indices length; expecting two indices - start and end - + // per submatch, and in total 2 + (number of tokens) submatches. + got := len(loc) + want := 2 * (2 + len(token.IDs)) + if got != want { + return nil, errors.Errorf("invalid number of submatches; expected %d, got %d", want, got) + } + // Skip the first two submatches as they do not identify specific tokens. 
+ loc = loc[2*2:] + return loc, nil +} diff --git a/examples/uc/testdata/input.c b/examples/uc/testdata/input.c new file mode 100644 index 0000000..c1e4d93 --- /dev/null +++ b/examples/uc/testdata/input.c @@ -0,0 +1,75 @@ +// This program illustrates the quick sort algorithm by sorting an +// array of char and printing the intermediate results. +// +// Adapted from N.Wirth: Algorithms + Data Structures = Programs + + +void putstring(char s[]); + +char eol[2]; +int n; + + +void sort(char a[], int l, int r) { + int i; + int j; + char x; + char w; + + + i = l; + j = r; + x = a[(l+r) / 2]; + + while ( i<= j) { + while (a[i] < x) i = i + 1; + while (x < a[j]) j = j - 1; + if (i<= j) { + w = a[i]; + a[i] = a[j]; + a[j] = w; + i = i + 1; + j = j - 1; + } + } + + putstring (a); + putstring (eol); + if (l < j) sort(a, l,j); + if (i < r) sort(a, i, r); + +} + +int main(void) +{ + char s[27]; + int i; + char t; + int q; + + eol[0] = '\n'; + eol[1] = 0; + + n = 26; + + s[n] = 0; + + i = 0; + + // Fill the string with random-looking data + q = 11; + while (i", uint(id)) +} + +// IDs specifies the string representation of each token ID. 
+var IDs = [...]string{ + "name(0, `char_lit`)", + "name(1, `ident`)", + "name(2, `int_lit`)", + "token(3, `!`)", + "token(4, `!=`)", + "token(5, `&&`)", + "token(6, `(`)", + "token(7, `)`)", + "token(8, `*`)", + "token(9, `+`)", + "token(10, `,`)", + "token(11, `-`)", + "token(12, `/`)", + "token(13, `;`)", + "token(14, `<`)", + "token(15, `<=`)", + "token(16, `=`)", + "token(17, `==`)", + "token(18, `>`)", + "token(19, `>=`)", + "token(20, `[`)", + "token(21, `]`)", + "token(22, `else`)", + "token(23, `if`)", + "token(24, `return`)", + "token(25, `typedef`)", + "token(26, `while`)", + "token(27, `{`)", + "token(28, `}`)", + "skip(29, `comment`)", + "skip(30, `whitespace`)", +} diff --git a/examples/uc/uc.ebnf b/examples/uc/uc.ebnf new file mode 100644 index 0000000..7c5db59 --- /dev/null +++ b/examples/uc/uc.ebnf @@ -0,0 +1,316 @@ +// A grammar for the µC programming language [1]. +// +// The content and structure of this document is heavily influenced by the Go +// Programming Language Specification [2] and some parts are therefore governed +// by a BSD-style license [3]. Any original content of this document is hereby +// released into the public domain [4]. +// +// References: +// [1]: https://www.it.uu.se/katalog/aleji304/CompilersProject/uc.html +// [2]: http://golang.org/ref/spec +// [3]: http://golang.org/LICENSE +// [4]: https://creativecommons.org/publicdomain/zero/1.0/ + +// # Source code representation +// + +// ## Characters +// + +// An arbitrary ASCII character except null (0x00), new lines (0x0A), carriage +// return (0x0D), apostrophe (0x27), double quote (0x22) and backslash (0x5C). +//_ascii_char +// = "\x01" … "\x09" +// | "\x0B" … "\x0C" +// | "\x0E" … "\x21" +// | "\x23" … "\x26" +// | "\x28" … "\x5B" +// | "\x5D" … "\x7F" +//. + +_ascii_letter = "a" … "z" | "A" … "Z" . +_ascii_digit = "0" … "9" . + +// ## Letters and digits +// + +_letter = _ascii_letter | "_" . +_decimal_digit = _ascii_digit . 
+_decimals = _decimal_digit { _decimal_digit } . + +// # Lexical elements +// + +// ## Comments +// + +_not_newline = "\x00" … "\x09" | "\x0B" … "\U0010FFFF" . +_not_star = "\x00" … "\x29" | "\x2B" … "\U0010FFFF" . + +_line_comment + = "/" "/" { _not_newline } "\n" + // TODO: Implement proper support for preprocessor directives. + | "#" { _not_newline } "\n" +. +_block_comment = "/" "*" { _not_star } "*" "/" . +comment = _line_comment | _block_comment . + +// ## Tokens +// + +// White space, formed from spaces (0x20), horizontal tabs (0x09), new line +// (line-feed (0x0A) or carriage-return (0x0D)), vertical tabs (0x0B), and form- +// feeds (0x0C) (§6.4), is ignored except as it separates tokens that would +// otherwise combine into a single token. +whitespace = " " | "\t" | "\v" | "\f" | "\r" | "\n" . + +// ## Identifiers +// + +ident = _letter { _letter | _decimal_digit } . + +// ## Integer literals +// + +int_lit = _decimals . + +// ## Character literals +// + +_escaped_char = "\\" "n" . +//char_lit = "'" ( _ascii_char | "\"" | _escaped_char ) "'" . +char_lit = "'" ( _escaped_char | "a" ) "'" . +//_char = _escaped_char | "a" . +//char_lit = "'" _char "'" . + +// # Syntactic production rules +// + +File + = Decls +. + +Decls = [ DeclList ] . + +DeclList + = Decl + | DeclList Decl +. + +Decl + = VarDecl ";" + | FuncDecl ";" + | FuncDef + | TypeDef ";" +. + +FuncDecl + = FuncHeader +. + +FuncHeader + // BasicType : "char" | "int" | "void" ; + = BasicType ident "(" Params ")" +. + +FuncDef + = FuncHeader BlockStmt +. + +VarDecl + = ScalarDecl + | ArrayDecl +. + +ScalarDecl + // BasicType : "char" | "int" ; + = BasicType ident +. + +ArrayDecl + // BasicType : "char" | "int" ; + = BasicType ident "[" IntLit "]" + | BasicType ident "[" "]" +. + +IntLit + = int_lit + | char_lit +. + +TypeDef + = "typedef" Type ident +. + +BasicType + // BasicType : "char" | "int" | "void" ; + = ident +. + +Params = [ ParamList ] . + +ParamList + = Param + | ParamList "," Param +. 
+ +Param + // BasicType : "void" ; + = Type + | VarDecl +. + +// TODO: Add support for array types. +Type + = BasicType +. + +Stmt + = MatchedStmt + | OpenStmt +. + +// Thanks to http://www.parsifalsoft.com/ifelse.html for loop statement +// resolving (while, do, for). + +OtherStmt + = Expr ";" + | "return" Expr ";" + | "return" ";" + | BlockStmt + | ";" +. + +BlockStmt + = "{" BlockItems "}" +. + +MatchedStmt + = "if" Condition MatchedStmt + "else" MatchedStmt + | "while" Condition MatchedStmt + | OtherStmt +. + +OpenStmt + = "if" Condition Stmt + | "if" Condition MatchedStmt + "else" OpenStmt + | "while" Condition OpenStmt +. + +Condition + = "(" Expr ")" +. + +BlockItems = [ BlockItemList ] . + +BlockItemList + = BlockItem + | BlockItemList BlockItem +. + +BlockItem + = Decl + | Stmt +. + +Expr + = Expr2R +. + +// Right-associative binary expressions with precedence 2. +// +// 2R: = +Expr2R + = Expr5L + // Right-associative. + | Expr5L "=" Expr2R +. + +// Left-associative binary expressions with precedence 5. +// +// 5L: && +Expr5L + = Expr9L + | Expr5L "&&" Expr9L +. + +// Left-associative binary expressions with precedence 9. +// +// 9L: == != +Expr9L + = Expr10L + | Expr9L "==" Expr10L + | Expr9L "!=" Expr10L +. + +// Left-associative binary expressions with precedence 10. +// +// 10L: < > <= >= +Expr10L + = Expr12L + | Expr10L "<" Expr12L + | Expr10L ">" Expr12L + | Expr10L "<=" Expr12L + | Expr10L ">=" Expr12L +. + +// Left-associative binary expressions with precedence 12. +// +// 12L: + - +Expr12L + = Expr13L + | Expr12L "+" Expr13L + | Expr12L "-" Expr13L +. + +// Left-associative binary expressions with precedence 13. +// +// 13L: * / +Expr13L + = Expr14 + | Expr13L "*" Expr14 + | Expr13L "/" Expr14 +. + +// Unary expressions with precedence 14. +// +// 14: - ! +Expr14 + = Expr15 + | "-" Expr14 + | "!" Expr14 +. + +// TODO: Replace function name with expression in call expression. Do the same +// for array names. 
+ +// TODO: Replace Expr15 (and similar names) with CastExpr, PostfixExpr, ... +// (from the C11 spec). + +// Expressions with precedence 15. +Expr15 + = PrimaryExpr + | ident "[" Expr "]" + | ident "(" Args ")" +. + +// Primary expressions with the highest precedence (§A.2.1). +PrimaryExpr + = int_lit + | char_lit + | ident + | ParenExpr +. + +ParenExpr + = "(" Expr ")" +. + +Args = [ ExprList ] . + +ExprList + = Expr + | ExprList "," Expr +. diff --git a/examples/uc/uc.json b/examples/uc/uc.json new file mode 100644 index 0000000..473d414 --- /dev/null +++ b/examples/uc/uc.json @@ -0,0 +1,132 @@ +{ + "names": [ + { + "id": "char_lit", + "reg": "'(?:\\\\n|a)'" + }, + { + "id": "ident", + "reg": "[A-Z_a-z][0-9A-Z_a-z]*" + }, + { + "id": "int_lit", + "reg": "[0-9][0-9]*" + } + ], + "tokens": [ + { + "id": "!", + "reg": "!" + }, + { + "id": "!=", + "reg": "!=" + }, + { + "id": "\u0026\u0026", + "reg": "\u0026\u0026" + }, + { + "id": "(", + "reg": "\\(" + }, + { + "id": ")", + "reg": "\\)" + }, + { + "id": "*", + "reg": "\\*" + }, + { + "id": "+", + "reg": "\\+" + }, + { + "id": ",", + "reg": "," + }, + { + "id": "-", + "reg": "-" + }, + { + "id": "/", + "reg": "/" + }, + { + "id": ";", + "reg": ";" + }, + { + "id": "\u003c", + "reg": "\u003c" + }, + { + "id": "\u003c=", + "reg": "\u003c=" + }, + { + "id": "=", + "reg": "=" + }, + { + "id": "==", + "reg": "==" + }, + { + "id": "\u003e", + "reg": "\u003e" + }, + { + "id": "\u003e=", + "reg": "\u003e=" + }, + { + "id": "[", + "reg": "\\[" + }, + { + "id": "]", + "reg": "\\]" + }, + { + "id": "else", + "reg": "else" + }, + { + "id": "if", + "reg": "if" + }, + { + "id": "return", + "reg": "return" + }, + { + "id": "typedef", + "reg": "typedef" + }, + { + "id": "while", + "reg": "while" + }, + { + "id": "{", + "reg": "\\{" + }, + { + "id": "}", + "reg": "\\}" + } + ], + "skip": [ + { + "id": "comment", + "reg": "//(?-s:.)*\\n|#(?-s:.)*\\n|/\\*[^\\*]*\\*/" + }, + { + "id": "whitespace", + "reg": "[\\t-\\r ]" + } + ] +}