Fix unicode dot (#376)

* The `.` regex should not take the ASCII fast path see #375 for an example of undefined behavior because of this fast path. TLDR: the ASCII fast path will stop matching on the first matching byte, however this would split multi-byte codepoints. Combined with `Lexer::remaining` (or even just capturing the string like in the issue), this leads to non-utf8 strings escaping into user code. This is UNSOUND. * Add tests for unicode dot in both str and bytes * chore(lib): rewrite using ClassUnicode methods As suggested by @RustyYato * Revert "chore(lib): rewrite using ClassUnicode methods" This reverts commit 80bd23f. * try: fallback to previous impl Tests are still passing * try: add repetition check * chore(lib): cleanup code * fix and move * another fix --------- Co-authored-by: Jérome Eertmans <jeertmans@icloud.com>
maciejhirsz · Feb 16, 2024 · 81f923c · 81f923c
1 parent ba69cc3
commit 81f923c
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 15 deletions.
diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs
@@ -7,7 +7,7 @@ use crate::mir::{Class, ClassUnicode, Literal, Mir};
 
 impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
     pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
-        self.parse_mir(mir, then, None, None)
+        self.parse_mir(mir, then, None, None, false)
     }
 
     fn parse_mir(
@@ -16,6 +16,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
         then: NodeId,
         miss: Option<NodeId>,
         reserved: Option<ReservedId>,
+        repeated: bool,
     ) -> NodeId {
         match mir {
             Mir::Empty => then,
@@ -29,21 +30,21 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
                     None => self.reserve(),
                 };
 
-                self.parse_mir(*mir, this.get(), Some(miss), Some(this))
+                self.parse_mir(*mir, this.get(), Some(miss), Some(this), true)
             }
             Mir::Maybe(mir) => {
                 let miss = match miss {
                     Some(id) => self.merge(id, then),
                     None => then,
                 };
 
-                self.parse_mir(*mir, then, Some(miss), reserved)
+                self.parse_mir(*mir, then, Some(miss), reserved, true)
             }
             Mir::Alternation(alternation) => {
                 let mut fork = Fork::new().miss(miss);
 
                 for mir in alternation {
-                    let id = self.parse_mir(mir, then, None, None);
+                    let id = self.parse_mir(mir, then, None, None, repeated);
                     let alt = self.fork_off(id);
 
                     fork.merge(alt, self);
@@ -73,7 +74,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
                         }
                         None
                     }
-                    Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => {
+                    Mir::Class(Class::Unicode(class)) if is_one_ascii(&class, repeated) => {
                         cur -= 1;
                         ropebuf[cur] = class.ranges()[0].into();
                         None
@@ -97,7 +98,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
 
                 for mir in concat.drain(1..).rev() {
                     if let Some(mir) = handle_bytes(self, mir, &mut then) {
-                        then = self.parse_mir(mir, then, None, None);
+                        then = self.parse_mir(mir, then, None, None, false);
                     }
                 }
 
@@ -107,10 +108,10 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
 
                         self.insert_or_push(reserved, rope)
                     }
-                    Some(mir) => self.parse_mir(mir, then, miss, reserved),
+                    Some(mir) => self.parse_mir(mir, then, miss, reserved, false),
                 }
             }
-            Mir::Class(Class::Unicode(class)) if !is_ascii(&class) => {
+            Mir::Class(Class::Unicode(class)) if !is_ascii(&class, repeated) => {
                 let mut ropes = class
                     .iter()
                     .flat_map(|range| Utf8Sequences::new(range.start(), range.end()))
@@ -160,25 +161,34 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
     }
 }
 
-fn is_ascii(class: &ClassUnicode) -> bool {
-    class.iter().all(|range| {
+/// Return wether current class unicode is ascii.
+///
+/// Because unicode ranges are iterated in increasing order,
+/// it is only necessary to check the last range.
+///
+/// If the check is performed in a repetition,
+/// a fast path is used by checking if end of range is 0x0010_FFFF.
+fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool {
+    class.iter().last().map_or(true, |range| {
         let start = range.start() as u32;
         let end = range.end() as u32;
-
-        start < 128 && (end < 128 || end == 0x0010_FFFF)
+        end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
     })
 }
 
-fn is_one_ascii(class: &ClassUnicode) -> bool {
+/// Return wether current class unicode is ascii and only contains
+/// one range.
+///
+/// See [`is_ascii`] function for more details.
+fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool {
     if class.ranges().len() != 1 {
         return false;
     }
 
     let range = &class.ranges()[0];
     let start = range.start() as u32;
     let end = range.end() as u32;
-
-    start < 128 && (end < 128 || end == 0x0010_FFFF)
+    end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
 }
 
 #[cfg(test)]

diff --git a/tests/tests/unicode_dot.rs b/tests/tests/unicode_dot.rs
@@ -0,0 +1,56 @@
+use logos::Logos as _;
+use logos_derive::Logos;
+
+#[derive(Logos, Debug, PartialEq)]
+enum TestUnicodeDot {
+    #[regex(".")]
+    Dot,
+}
+
+#[test]
+fn test_unicode_dot_str_ascii() {
+    let mut lexer = TestUnicodeDot::lexer("a");
+    assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot)));
+    assert_eq!(lexer.remainder(), "");
+    assert_eq!(lexer.next(), None);
+}
+
+#[test]
+fn test_unicode_dot_str_unicode() {
+    let mut lexer = TestUnicodeDot::lexer("");
+    assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot)));
+    assert_eq!(lexer.remainder(), "");
+    assert_eq!(lexer.next(), None);
+}
+
+#[derive(Logos, Debug, PartialEq)]
+enum TestUnicodeDotBytes {
+    #[regex(".", priority = 100)]
+    Dot,
+    #[regex(b".", priority = 0)]
+    InvalidUtf8,
+}
+
+#[test]
+fn test_unicode_dot_bytes_ascii() {
+    let mut lexer = TestUnicodeDotBytes::lexer(b"a");
+    assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot)));
+    assert_eq!(lexer.remainder(), b"");
+    assert_eq!(lexer.next(), None);
+}
+
+#[test]
+fn test_unicode_dot_bytes_unicode() {
+    let mut lexer = TestUnicodeDotBytes::lexer("".as_bytes());
+    assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot)));
+    assert_eq!(lexer.remainder(), b"");
+    assert_eq!(lexer.next(), None);
+}
+
+#[test]
+fn test_unicode_dot_bytes_invalid_utf8() {
+    let mut lexer = TestUnicodeDotBytes::lexer(b"\xff");
+    assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::InvalidUtf8)));
+    assert_eq!(lexer.remainder(), b"");
+    assert_eq!(lexer.next(), None);
+}