Make lexer produce Result<Token, Error> instead of Token. (#273)
* Make lexer produce Result<Token, Error> instead of Token.

Remove the requirement of designating an error token.
Remove the `#[error]` attribute.
Using `Result` instead allows passing the error details downstream, as the sketch below illustrates.

* Fix post-conflict errors and fmt

---------
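
A minimal sketch of the downstream handling this enables (the `Token` definition below is illustrative, not taken from this repository):

```rust
use logos::Logos;

#[derive(Logos, Debug, PartialEq)]
enum Token {
    #[token("fast")]
    Fast,

    #[regex(r"[ \t\n\f]+", logos::skip)]
    Ignored,
}

fn main() {
    // The lexer iterator now yields `Result<Token, <Token as Logos>::Error>`
    // (the error type defaults to `()`), so unrecognized input shows up as
    // `Err(())` that callers can inspect instead of an error token.
    for result in Token::lexer("fast ???") {
        match result {
            Ok(token) => println!("token: {:?}", token),
            Err(err) => println!("lexing error: {:?}", err),
        }
    }
}
```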

Co-authored-by: Maciej Hirsz <1096222+maciejhirsz@users.noreply.github.com>
Co-authored-by: Maciej Hirsz <hello@maciej.codes>
3 people committed Feb 26, 2023
1 parent 56afbaa commit 8ea5cac
Showing 24 changed files with 830 additions and 721 deletions.
216 changes: 110 additions & 106 deletions README.md
@@ -25,55 +25,53 @@ To achieve those, **Logos**:
## Example

```rust
use logos::Logos;

#[derive(Logos, Debug, PartialEq)]
enum Token {
// Tokens can be literal strings, of any length.
#[token("fast")]
Fast,

#[token(".")]
Period,

// Or regular expressions.
#[regex("[a-zA-Z]+")]
Text,

// Logos requires one token variant to handle errors,
// it can be named anything you wish.
#[error]
// We can also use this variant to define whitespace,
// or any other matches we wish to skip.
#[regex(r"[ \t\n\f]+", logos::skip)]
Error,
}

fn main() {
let mut lex = Token::lexer("Create ridiculously fast Lexers.");

assert_eq!(lex.next(), Some(Token::Text));
assert_eq!(lex.span(), 0..6);
assert_eq!(lex.slice(), "Create");

assert_eq!(lex.next(), Some(Token::Text));
assert_eq!(lex.span(), 7..19);
assert_eq!(lex.slice(), "ridiculously");

assert_eq!(lex.next(), Some(Token::Fast));
assert_eq!(lex.span(), 20..24);
assert_eq!(lex.slice(), "fast");

assert_eq!(lex.next(), Some(Token::Text));
assert_eq!(lex.span(), 25..31);
assert_eq!(lex.slice(), "Lexers");

assert_eq!(lex.next(), Some(Token::Period));
assert_eq!(lex.span(), 31..32);
assert_eq!(lex.slice(), ".");

assert_eq!(lex.next(), None);
}
use logos::Logos;

#[derive(Logos, Debug, PartialEq)]
enum Token {
// Tokens can be literal strings, of any length.
#[token("fast")]
Fast,

#[token(".")]
Period,

// Or regular expressions.
#[regex("[a-zA-Z]+")]
Text,

// A dedicated token variant can be used to define whitespace,
// or any other matches we wish to skip.
// It can be named anything you wish.
#[regex(r"[ \t\n\f]+", logos::skip)]
Ignored,
}

fn main() {
let mut lex = Token::lexer("Create ridiculously fast Lexers.");

assert_eq!(lex.next(), Some(Ok(Token::Text)));
assert_eq!(lex.span(), 0..6);
assert_eq!(lex.slice(), "Create");

assert_eq!(lex.next(), Some(Ok(Token::Text)));
assert_eq!(lex.span(), 7..19);
assert_eq!(lex.slice(), "ridiculously");

assert_eq!(lex.next(), Some(Ok(Token::Fast)));
assert_eq!(lex.span(), 20..24);
assert_eq!(lex.slice(), "fast");

assert_eq!(lex.next(), Some(Ok(Token::Text)));
assert_eq!(lex.slice(), "Lexers");
assert_eq!(lex.span(), 25..31);

assert_eq!(lex.next(), Some(Ok(Token::Period)));
assert_eq!(lex.span(), 31..32);
assert_eq!(lex.slice(), ".");

assert_eq!(lex.next(), None);
}
```

### Callbacks
@@ -82,71 +80,77 @@ fn main() {
which can be used to put data into a variant:

```rust
use logos::{Logos, Lexer};

// Note: callbacks can return `Option` or `Result`
fn kilo(lex: &mut Lexer<Token>) -> Option<u64> {
let slice = lex.slice();
let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'k'
Some(n * 1_000)
}

fn mega(lex: &mut Lexer<Token>) -> Option<u64> {
let slice = lex.slice();
let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'M'
Some(n * 1_000_000)
}

#[derive(Logos, Debug, PartialEq)]
enum Token {
#[error]
#[regex(r"[ \t\n\f]+", logos::skip)]
Error,

// Callbacks can use closure syntax, or refer
// to a function defined elsewhere.
//
// Each pattern can have its own callback.
#[regex("[0-9]+", |lex| lex.slice().parse())]
#[regex("[0-9]+k", kilo)]
#[regex("[0-9]+M", mega)]
Number(u64),
}

fn main() {
let mut lex = Token::lexer("5 42k 75M");

assert_eq!(lex.next(), Some(Token::Number(5)));
assert_eq!(lex.slice(), "5");

assert_eq!(lex.next(), Some(Token::Number(42_000)));
assert_eq!(lex.slice(), "42k");

assert_eq!(lex.next(), Some(Token::Number(75_000_000)));
assert_eq!(lex.slice(), "75M");

assert_eq!(lex.next(), None);
}
use logos::{Logos, Lexer};

// Note: callbacks can return `Option` or `Result`
fn kilo(lex: &mut Lexer<Token>) -> Option<u64> {
let slice = lex.slice();
let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'k'
Some(n * 1_000)
}

fn mega(lex: &mut Lexer<Token>) -> Option<u64> {
let slice = lex.slice();
let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'm'
Some(n * 1_000_000)
}

#[derive(Logos, Debug, PartialEq)]
enum Token {
#[regex(r"[ \t\n\f]+", logos::skip)]
Ignored,

// Callbacks can use closure syntax, or refer
// to a function defined elsewhere.
//
// Each pattern can have its own callback.
#[regex("[0-9]+", |lex| lex.slice().parse().ok())]
#[regex("[0-9]+k", kilo)]
#[regex("[0-9]+m", mega)]
Number(u64),
}

fn main() {
let mut lex = Token::lexer("5 42k 75m");

assert_eq!(lex.next(), Some(Ok(Token::Number(5))));
assert_eq!(lex.slice(), "5");

assert_eq!(lex.next(), Some(Ok(Token::Number(42_000))));
assert_eq!(lex.slice(), "42k");

assert_eq!(lex.next(), Some(Ok(Token::Number(75_000_000))));
assert_eq!(lex.slice(), "75m");

assert_eq!(lex.next(), None);
}
```

**Logos** can handle callbacks with the following return types:

| Return type | Produces |
|-----------------------------------|----------------------------------------------------------------------------------|
| `()` | `Token::Unit` |
| `bool` | `Token::Unit` **or** `<Token as Logos>::ERROR` |
| `Result<(), _>` | `Token::Unit` **or** `<Token as Logos>::ERROR` |
| `T` | `Token::Value(T)` |
| `Option<T>` | `Token::Value(T)` **or** `<Token as Logos>::ERROR` |
| `Result<T, _>` | `Token::Value(T)` **or** `<Token as Logos>::ERROR` |
| `Skip` | _skips matched input_ |
| `Filter<T>` | `Token::Value(T)` **or** _skips matched input_ |
| `FilterResult<T>`                 | `Token::Value(T)` **or** `<Token as Logos>::ERROR` **or** _skips matched input_  |

| Return type | Produces |
|------------------------|-----------------------------------------------------------------------------------------------------|
| `()` | `Ok(Token::Unit)` |
| `bool` | `Ok(Token::Unit)` **or** `Err(<Token as Logos>::Error::default())` |
| `Result<(), E>` | `Ok(Token::Unit)` **or** `Err(<Token as Logos>::Error::from(err))` |
| `T` | `Ok(Token::Value(T))` |
| `Option<T>` | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::default())` |
| `Result<T, E>` | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::from(err))` |
| [`Skip`] | _skips matched input_ |
| [`Filter<T>`] | `Ok(Token::Value(T))` **or** _skips matched input_ |
| [`FilterResult<T, E>`] | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::from(err))` **or** _skips matched input_ |
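
As an illustration of the `Filter` and `FilterResult` rows, here is a sketch of a callback that emits, skips, or errors depending on the matched slice; the zero-skipping rule and the `()` error type are arbitrary choices for this example:

```rust
use logos::{FilterResult, Lexer, Logos};

// Emit non-zero numbers, silently skip zeros, and turn overflow into an error.
fn number(lex: &mut Lexer<Token>) -> FilterResult<u64, ()> {
    match lex.slice().parse::<u64>() {
        Ok(0) => FilterResult::Skip,
        Ok(n) => FilterResult::Emit(n),
        Err(_) => FilterResult::Error(()),
    }
}

#[derive(Logos, Debug, PartialEq)]
enum Token {
    #[regex(r"[ \t\n\f]+", logos::skip)]
    Ignored,

    #[regex("[0-9]+", number)]
    Number(u64),
}

fn main() {
    let mut lex = Token::lexer("7 0 99999999999999999999");

    assert_eq!(lex.next(), Some(Ok(Token::Number(7))));
    // `0` was skipped entirely; the overflowing literal becomes an error.
    assert_eq!(lex.next(), Some(Err(())));
    assert_eq!(lex.next(), None);
}
```
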
Callbacks can also be used to perform more specialized lexing in places
where regular expressions are too limiting. For specifics, look at
`Lexer::remainder` and `Lexer::bump`.
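
For instance, here is a sketch of a callback that consumes a C-style block comment by inspecting the remaining input directly (the token names and grammar are illustrative):

```rust
use logos::{Lexer, Logos};

// Extend the matched `/*` over the whole comment by scanning the remaining
// input and bumping the lexer past the closing `*/`.
fn block_comment(lex: &mut Lexer<Token>) {
    match lex.remainder().find("*/") {
        Some(end) => lex.bump(end + 2),          // include the closing `*/`
        None => lex.bump(lex.remainder().len()), // unterminated: take the rest
    }
}

#[derive(Logos, Debug, PartialEq)]
enum Token {
    #[regex(r"[ \t\n\f]+", logos::skip)]
    Ignored,

    #[token("/*", block_comment)]
    BlockComment,

    #[regex("[a-zA-Z]+")]
    Text,
}

fn main() {
    let mut lex = Token::lexer("before /* not tokens */ after");

    assert_eq!(lex.next(), Some(Ok(Token::Text)));
    assert_eq!(lex.next(), Some(Ok(Token::BlockComment)));
    assert_eq!(lex.slice(), "/* not tokens */");
    assert_eq!(lex.next(), Some(Ok(Token::Text)));
    assert_eq!(lex.next(), None);
}
```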

## Errors

By default, **Logos** uses `()` as the error type, which means that it
doesn't store any information about the error.
This can be changed by using the `#[logos(error = T)]` attribute on the enum.
The type `T` can be any type that implements `Clone`, `PartialEq`,
`Default` and `From<E>` for each callback's error type `E`.
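
A sketch of such a custom error type; the `LexingError` name and its variants are illustrative, not part of this commit:

```rust
use logos::Logos;

#[derive(Default, Debug, Clone, PartialEq)]
enum LexingError {
    InvalidInteger(String),
    #[default]
    Other,
}

// Callback errors are converted into the custom type via `From`.
impl From<std::num::ParseIntError> for LexingError {
    fn from(err: std::num::ParseIntError) -> Self {
        LexingError::InvalidInteger(err.to_string())
    }
}

#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
enum Token {
    #[regex(r"[ \t\n\f]+", logos::skip)]
    Ignored,

    #[regex("[0-9]+", |lex| lex.slice().parse())]
    Number(u64),
}

fn main() {
    let mut lex = Token::lexer("42 99999999999999999999");

    assert_eq!(lex.next(), Some(Ok(Token::Number(42))));
    // The overflow surfaces as a structured error instead of a sentinel token.
    assert!(matches!(
        lex.next(),
        Some(Err(LexingError::InvalidInteger(_)))
    ));
    assert_eq!(lex.next(), None);
}
```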

## Token disambiguation

Rule of thumb is:
5 changes: 2 additions & 3 deletions logos-cli/tests/data/fmt_output.rs
@@ -1,12 +1,11 @@
#[derive(Debug, Clone, Copy, PartialEq)]
enum Token {
Letter,
Error,
}
impl<'s> ::logos::Logos<'s> for Token {
type Error = ();
type Extras = ();
type Source = str;
const ERROR: Self = Token::Error;
fn lex(lex: &mut ::logos::Lexer<'s, Self>) {
use logos::internal::{CallbackResult, LexerInternal};
type Lexer<'s> = ::logos::Lexer<'s, Token>;
@@ -20,7 +19,7 @@ impl<'s> ::logos::Logos<'s> for Token {
macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; }
#[inline]
fn goto1_x<'s>(lex: &mut Lexer<'s>) {
lex.set(Token::Letter);
lex.set(Ok(Token::Letter));
}
#[inline]
fn goto3_at1_with3<'s>(lex: &mut Lexer<'s>) {
2 changes: 0 additions & 2 deletions logos-cli/tests/data/input.rs
@@ -2,6 +2,4 @@
enum Token {
#[regex("a-z")]
Letter,
#[error]
Error,
}
2 changes: 1 addition & 1 deletion logos-cli/tests/data/output.rs
@@ -1 +1 @@
# [derive (Debug , Clone , Copy , PartialEq)] enum Token { Letter , Error , }impl < 's > :: logos :: Logos < 's > for Token { type Extras = () ; type Source = str ; const ERROR : Self = Token :: Error ; fn lex (lex : & mut :: logos :: Lexer < 's , Self >) { use :: logos :: internal :: { LexerInternal , CallbackResult } ; type Lexer < 's > = :: logos :: Lexer < 's , Token > ; fn _end < 's > (lex : & mut Lexer < 's >) { lex . end () } fn _error < 's > (lex : & mut Lexer < 's >) { lex . bump_unchecked (1) ; lex . error () ; } macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; } # [inline] fn goto1_x < 's > (lex : & mut Lexer < 's >) { lex . set (Token :: Letter) ; } # [inline] fn goto3_at1_with3 < 's > (lex : & mut Lexer < 's >) { match lex . read_at :: < & [u8 ; 2usize] > (1usize) { Some (b"-z") => { lex . bump_unchecked (3usize) ; goto1_x (lex) } , _ => _error (lex) , } } # [inline] fn goto4 < 's > (lex : & mut Lexer < 's >) { let arr = match lex . read :: < & [u8 ; 3usize] > () { Some (arr) => arr , None => return _end (lex) , } ; match arr [0] { b'a' => goto3_at1_with3 (lex) , _ => _error (lex) , } } goto4 (lex) } }
# [derive (Debug , Clone , Copy , PartialEq)] enum Token { Letter , }impl < 's > :: logos :: Logos < 's > for Token { type Error = () ; type Extras = () ; type Source = str ; fn lex (lex : & mut :: logos :: Lexer < 's , Self >) { use :: logos :: internal :: { LexerInternal , CallbackResult } ; type Lexer < 's > = :: logos :: Lexer < 's , Token > ; fn _end < 's > (lex : & mut Lexer < 's >) { lex . end () } fn _error < 's > (lex : & mut Lexer < 's >) { lex . bump_unchecked (1) ; lex . error () ; } macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; } # [inline] fn goto1_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Letter)) ; } # [inline] fn goto3_at1_with3 < 's > (lex : & mut Lexer < 's >) { match lex . read_at :: < & [u8 ; 2usize] > (1usize) { Some (b"-z") => { lex . bump_unchecked (3usize) ; goto1_x (lex) } , _ => _error (lex) , } } # [inline] fn goto4 < 's > (lex : & mut Lexer < 's >) { let arr = match lex . read :: < & [u8 ; 3usize] > () { Some (arr) => arr , None => return _end (lex) , } ; match arr [0] { b'a' => goto3_at1_with3 (lex) , _ => _error (lex) , } } goto4 (lex) } }
4 changes: 2 additions & 2 deletions logos-codegen/src/generator/leaf.rs
@@ -41,12 +41,12 @@ impl<'a> Generator<'a> {
}
None if matches!(leaf.field, MaybeVoid::Void) => quote! {
#bump
lex.set(#name::#ident);
lex.set(Ok(#name::#ident));
},
None => quote! {
#bump
let token = #name::#ident(lex.slice());
lex.set(token);
lex.set(Ok(token));
},
}
}
36 changes: 15 additions & 21 deletions logos-codegen/src/lib.rs
@@ -23,7 +23,6 @@ use parser::{Mode, Parser};
use quote::ToTokens;
use util::MaybeVoid;

use proc_macro2::Span;
use proc_macro2::TokenStream;
use quote::quote;
use syn::parse_quote;
@@ -43,7 +42,6 @@ pub fn generate(input: TokenStream) -> TokenStream {

let name = &item.ident;

let mut error = None;
let mut parser = Parser::default();

for param in item.generics.params {
@@ -113,12 +111,16 @@

match attr_name.as_str() {
ERROR_ATTR => {
let span = variant.ident.span();
if let Some(previous) = error.replace(&variant.ident) {
parser
.err("Only one #[error] variant can be declared.", span)
.err("Previously declared #[error]:", previous.span());
}
// TODO: Remove in future versions
parser.err(
"\
Since 0.13 Logos no longer requires the #[error] variant.\n\
\n\
For help with migration see release notes: \
https://github.com/maciejhirsz/logos/releases\
",
attr.span(),
);
}
END_ATTR => {
// TODO: Remove in future versions
@@ -207,6 +209,7 @@ pub fn generate(input: TokenStream) -> TokenStream {

let mut root = Fork::new();

let error_type = parser.error_type.take();
let extras = parser.extras.take();
let source = match parser.mode {
Mode::Utf8 => quote!(str),
@@ -217,27 +220,19 @@
.take()
.unwrap_or_else(|| parse_quote!(::logos));

let error_def = match error {
Some(error) => Some(quote!(const ERROR: Self = #name::#error;)),
None => {
parser.err("missing #[error] token variant.", Span::call_site());
None
}
};

let generics = parser.generics();
let this = quote!(#name #generics);

let impl_logos = |body| {
quote! {
impl<'s> #logos_path::Logos<'s> for #this {
impl<'s> ::logos::Logos<'s> for #this {
type Error = #error_type;

type Extras = #extras;

type Source = #source;

#error_def

fn lex(lex: &mut #logos_path::Lexer<'s, Self>) {
fn lex(lex: &mut ::logos::Lexer<'s, Self>) {
#body
}
}
@@ -360,7 +355,6 @@ fn strip_attrs_from_vec(attrs: &mut Vec<syn::Attribute>) {
fn is_logos_attr(attr: &syn::Attribute) -> bool {
attr.path.is_ident(LOGOS_ATTR)
|| attr.path.is_ident(EXTRAS_ATTR)
|| attr.path.is_ident(ERROR_ATTR)
|| attr.path.is_ident(END_ATTR)
|| attr.path.is_ident(TOKEN_ATTR)
|| attr.path.is_ident(REGEX_ATTR)