feat: lalrpop lexer prototype (#4656)
# Description

## Problem\*

Resolves #4655

## Summary\*

NOTE: I've added `CC0-1.0` to `deny.toml`. Its purpose is to release
work to the public domain.

### TL;DR

[`lalrpop`](https://github.com/lalrpop/lalrpop) generates part of a parser for
our grammar, and the `noirc_frontend/experimental_parser` feature runs it
alongside the existing parser: whenever the existing `chumsky` parser succeeds,
the `lalrpop` parser is run on the same input and the two results are compared.
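
As a rough sketch of that flow (a sketch under assumed names: `parse_program`
and `experimental_parse_program` stand in for whatever entry points the crate
actually exposes):

```rust
// Run the experimental lalrpop parser only when the chumsky parser succeeds,
// then compare the two results; any divergence points at a bug in one of the
// parsers. Names here are illustrative, not this PR's exact API.
#[cfg(feature = "experimental_parser")]
fn compare_parsers(source: &str) {
    let (module, errors) = parse_program(source);
    if errors.is_empty() {
        let experimental_module = experimental_parse_program(source);
        assert_eq!(module, experimental_module, "parser outputs diverged");
    }
}
```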

### Detail

Run with `cargo test --features=noirc_frontend/experimental_parser`

Originally this PR investigated lalrpop for _lexing_; it is now used for _parsing_:
- lalrpop doesn't handle string literals with escape characters well
  + this is due to how LR(1) parsers handle context
  + I realized this after discovering that all of the larger examples
    (solidity, python, etc.) use the lexer to handle strings
- pivoted this PR to use our existing lexer
  + this entailed a version of `Token` without `String` so that e.g.
    `Token::Str("hi")` can be matched on (which is essential to lalrpop)
  + currently using a shim to minimize impact on the rest of the code:
    * conversion to/from `SpannedToken`
    * lexing the whole input into a `Vec` and using `iter()` to feed borrowed
      tokens to lalrpop (see the sketch after this list)
- current state:
  + WIP parsing use statements (missing support for recursive statements)
  + unit tests for use statements exercise the lalrpop parser
  + the feature flag `experimental_parser` enables running the experimental
    parser on every part of the AST that is successfully parsed with the
    existing chumsky parser
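
A minimal sketch of that shim, assuming the generated module is named
`noir_parser` and exposes a `UseTreeParser` entry point (both names are
assumptions; `Lexer` and `from_spanned_token_result` are the items touched in
this diff):

```rust
// Lex the whole input up front so the borrowed tokens have a Vec to borrow
// from for the duration of the parse.
let tokens: Vec<SpannedTokenResult> = Lexer::new(source).collect();

// lalrpop's external-lexer interface consumes an iterator of
// Result<(start, token, end), error> triples, which is exactly what
// from_spanned_token_result produces.
let triples = tokens.iter().map(from_spanned_token_result);

// UseTreeParser is a hypothetical name for the generated entry point.
let use_tree = noir_parser::UseTreeParser::new().parse(triples);
```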

Changes:
- The lexer now rejects whitespace characters other than tab (`\t`), newline
  (`\n`), carriage return (`\r`), and space (a hypothetical check is sketched
  below)
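
A quick sketch of what the new behaviour implies, assuming a `Lexer::new`
constructor and that unsupported whitespace surfaces as a lexing error (both
are assumptions for illustration):

```rust
// Vertical tab (U+000B) is Unicode whitespace but not one of the four
// accepted "code whitespace" characters, so lexing it should now yield an
// error instead of silently skipping it. Hypothetical check, not a test
// from this PR.
let mut lexer = Lexer::new("fn\u{000B}main() {}");
assert!(lexer.any(|token_result| token_result.is_err()));
```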

## Additional Context



## Documentation\*

Check one:
- [x] No documentation needed.
- [ ] Documentation included in this PR.
- [ ] **[For Experimental Features]** Documentation to be submitted in a
separate PR.

# PR Checklist\*

- [x] I have tested the changes locally.
- [x] I have formatted the changes with [Prettier](https://prettier.io/)
and/or `cargo fmt` on default settings.

---------

Co-authored-by: Maxim Vezenov <mvezenov@gmail.com>
Co-authored-by: Tom French <tom@tomfren.ch>
3 people committed Apr 15, 2024
1 parent 5b23171 commit 25ad018
Showing 9 changed files with 625 additions and 44 deletions.
122 changes: 114 additions & 8 deletions Cargo.lock


7 changes: 7 additions & 0 deletions compiler/noirc_frontend/Cargo.toml
@@ -24,9 +24,16 @@ small-ord-set = "0.1.3"
regex = "1.9.1"
tracing.workspace = true
petgraph = "0.6"
lalrpop-util = { version = "0.20.2", features = ["lexer"] }

[dev-dependencies]
base64.workspace = true
strum = "0.24"
strum_macros = "0.24"
tempfile.workspace = true

[build-dependencies]
lalrpop = "0.20.2"

[features]
experimental_parser = []
28 changes: 28 additions & 0 deletions compiler/noirc_frontend/build.rs
@@ -0,0 +1,28 @@
use std::fs::{read_to_string, File};
use std::io::Write;

fn main() {
    lalrpop::Configuration::new()
        .emit_rerun_directives(true)
        .use_cargo_dir_conventions()
        .process()
        .unwrap();

    // The generated parser triggers a lint on "extern crate core", so patch
    // that here until lalrpop fixes it upstream (adding cfg directives to the
    // generated file appears to be unsupported by lalrpop).
    let out_dir = std::env::var("OUT_DIR").unwrap();
    let parser_path = std::path::Path::new(&out_dir).join("noir_parser.rs");
    let content_str = read_to_string(parser_path.clone()).unwrap();
    let mut parser_file = File::create(parser_path).unwrap();
    for line in content_str.lines() {
        if line.contains("extern crate core") {
            parser_file
                .write_all(
                    format!("{}\n", line.replace("extern crate core", "use core")).as_bytes(),
                )
                .unwrap();
        } else {
            parser_file.write_all(format!("{}\n", line).as_bytes()).unwrap();
        }
    }
}
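
For context, the patched file in `OUT_DIR` is then typically pulled into the crate along these lines (a sketch; the actual module wiring in this PR may differ):

```rust
// lalrpop_mod! expands to an include! of OUT_DIR/noir_parser.rs, i.e. the
// file the build script above just rewrote.
lalrpop_util::lalrpop_mod!(pub noir_parser);
```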
30 changes: 27 additions & 3 deletions compiler/noirc_frontend/src/lexer/lexer.rs
@@ -2,7 +2,9 @@ use crate::token::{Attribute, DocStyle};

use super::{
    errors::LexerErrorKind,
    token::{IntType, Keyword, SpannedToken, Token, Tokens},
    token::{
        token_to_borrowed_token, BorrowedToken, IntType, Keyword, SpannedToken, Token, Tokens,
    },
};
use acvm::FieldElement;
use noirc_errors::{Position, Span};
@@ -21,6 +23,21 @@ pub struct Lexer<'a> {

pub type SpannedTokenResult = Result<SpannedToken, LexerErrorKind>;

pub(crate) fn from_spanned_token_result(
    token_result: &SpannedTokenResult,
) -> Result<(usize, BorrowedToken<'_>, usize), LexerErrorKind> {
    token_result
        .as_ref()
        .map(|spanned_token| {
            (
                spanned_token.to_span().start() as usize,
                token_to_borrowed_token(spanned_token.into()),
                spanned_token.to_span().end() as usize,
            )
        })
        .map_err(Clone::clone)
}

impl<'a> Lexer<'a> {
    /// Given a source file of noir code, return all the tokens in the file
    /// in order, along with any lexing errors that occurred.
@@ -94,7 +111,7 @@ impl<'a> Lexer<'a> {

    fn next_token(&mut self) -> SpannedTokenResult {
        match self.next_char() {
            Some(x) if x.is_whitespace() => {
            Some(x) if Self::is_code_whitespace(x) => {
                let spanned = self.eat_whitespace(x);
                if self.skip_whitespaces {
                    self.next_token()
@@ -560,16 +577,21 @@ impl<'a> Lexer<'a> {
        }
    }

    fn is_code_whitespace(c: char) -> bool {
        c == '\t' || c == '\n' || c == '\r' || c == ' '
    }

    /// Skips white space. They are not significant in the source language
    fn eat_whitespace(&mut self, initial_char: char) -> SpannedToken {
        let start = self.position;
        let whitespace = self.eat_while(initial_char.into(), |ch| ch.is_whitespace());
        let whitespace = self.eat_while(initial_char.into(), Self::is_code_whitespace);
        SpannedToken::new(Token::Whitespace(whitespace), Span::inclusive(start, self.position))
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = SpannedTokenResult;

    fn next(&mut self) -> Option<Self::Item> {
        if self.done {
            None
@@ -578,10 +600,12 @@ impl<'a> Iterator for Lexer<'a> {
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::{FunctionAttribute, SecondaryAttribute, TestScope};

    #[test]
    fn test_single_double_char() {
        let input = "! != + ( ) { } [ ] | , ; : :: < <= > >= & - -> . .. % / * = == << >>";
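
For reference, the `(start, token, end)` triples built by `from_spanned_token_result` above are consumed through the `extern` block of the lalrpop grammar. A sketch of what that block might look like, with an illustrative token list rather than the grammar actually shipped in this PR:

```
grammar<'input>;

extern {
    type Location = usize;
    type Error = LexerErrorKind;

    enum BorrowedToken<'input> {
        "ident" => BorrowedToken::Ident(<&'input str>),
        "str" => BorrowedToken::Str(<&'input str>),
        "use" => BorrowedToken::Keyword(Keyword::Use),
        "::" => BorrowedToken::DoubleColon,
        ";" => BorrowedToken::Semicolon,
    }
}
```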
