Transcode non-ASCII input to UTF-16 in nsstring!

Closes #3. For safety, the UTF-16 data has a trailing null byte that is not counted as part of the string's length. This also allows for interior null bytes, since strings that contain nulls are encoded as UTF-16. Co-authored-by: Nikolai Vazquez <hello@nikolaivazquez.com> Co-authored-by: Thom Chiovoloni <chiovolonit@gmail.com> Co-authored-by: rodrimati1992 <rodrimatt1985@gmail.com>
nvzqz · Sep 15, 2020 · 868fd65 · 868fd65
1 parent 10b763b
commit 868fd65
Show file tree

Hide file tree

Showing 6 changed files with 367 additions and 137 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,9 @@ The format is based on [Keep a Changelog] and this project adheres to
 - The `nsstring!` macro can now take `const X: &str` as input, not just string
   literals.
 
+- The `nsstring!` macro now allows interior null bytes, transcoding the string
+  to UTF-16.
+
 - The `nsstring!` macro now allows trailing null bytes and uses the constant
   as-is.
 
@@ -35,6 +38,16 @@ The format is based on [Keep a Changelog] and this project adheres to
     These are aliased in `foundation` as: `NSPoint`, `NSSize`, `NSRect`, and
     `NSRectEdge`.
 
+### Fixed
+
+- The `nsstring!` macro now transcodes non-ASCII strings to UTF-16, instead of
+  allowing UTF-8 data where only ASCII data is expected.
+
+  Transcoding was implemented by [@thomcc]. Iterator technique was provided by
+  [@rodrimati1992].
+
+  See issue [#3].
+
 ### Changed
 
 - **\[breaking\]** Increased crate `#[cfg]` strictness from any 32/64 bit to
@@ -72,6 +85,9 @@ Initial release.
 [Keep a Changelog]:    http://keepachangelog.com/en/1.0.0/
 [Semantic Versioning]: http://semver.org/spec/v2.0.0.html
 
+[@thomcc]: https://github.com/thomcc
+[@rodrimati1992]: https://github.com/rodrimati1992
+
 [#3]: https://github.com/nvzqz/fruity/issues/3
 [#1]: https://github.com/nvzqz/fruity/issues/1
 

diff --git a/src/_priv/cfstring/mod.rs b/src/_priv/cfstring/mod.rs
@@ -0,0 +1,90 @@
+use std::ffi::c_void;
+
+pub mod utf16;
+
+// From `CFString.c`:
+// > !!! Note: Constant CFStrings use the bit patterns:
+// > C8 (11001000 = default allocator, not inline, not freed contents; 8-bit; has NULL byte; doesn't have length; is immutable)
+// > D0 (11010000 = default allocator, not inline, not freed contents; Unicode; is immutable)
+// > The bit usages should not be modified in a way that would effect these bit patterns.
+//
+// The `0x07` byte is the `CFTypeID` of `CFStringRef`.
+const FLAGS_ASCII: usize = 0x07_C8;
+const FLAGS_UTF16: usize = 0x07_D0;
+
+#[repr(C)]
+pub struct CFStringAscii {
+    isa: *const c_void,
+    flags: usize,
+    data: *const u8,
+    len: usize,
+}
+
+// Required to place in a `static`.
+unsafe impl Sync for CFStringAscii {}
+
+impl CFStringAscii {
+    pub const fn new(isa: *const c_void, data: *const u8, len: usize) -> Self {
+        Self {
+            isa,
+            data,
+            len,
+            flags: FLAGS_ASCII,
+        }
+    }
+
+    pub const fn as_ptr(&self) -> *const c_void {
+        self as *const Self as *const c_void
+    }
+}
+
+#[repr(C)]
+pub struct CFStringUtf16 {
+    isa: *const c_void,
+    flags: usize,
+    data: *const u16,
+    len: usize,
+}
+
+// Required to place in a `static`.
+unsafe impl Sync for CFStringUtf16 {}
+
+impl CFStringUtf16 {
+    pub const fn new(isa: *const c_void, data: *const u16, len: usize) -> Self {
+        Self {
+            isa,
+            data,
+            len,
+            flags: FLAGS_UTF16,
+        }
+    }
+
+    pub const fn as_ptr(&self) -> *const c_void {
+        self as *const Self as *const c_void
+    }
+}
+
+/// Returns the bytes of `s` with a single trailing 0 byte, if present, removed.
+pub const fn trim_trailing_nul(s: &str) -> &[u8] {
+    match s.as_bytes() {
+        [b @ .., 0] => b,
+        b => b,
+    }
+}
+
+/// Returns `true` if `bytes` is entirely ASCII and contains no null bytes.
+pub const fn is_ascii(bytes: &[u8]) -> bool {
+    let mut i = 0;
+    loop {
+        if i == bytes.len() {
+            return true;
+        }
+
+        let byte = bytes[i];
+        if !byte.is_ascii() || byte == 0 {
+            return false;
+        }
+
+        i += 1;
+    }
+}
diff --git a/src/_priv/cfstring/utf16.rs b/src/_priv/cfstring/utf16.rs
@@ -0,0 +1,130 @@
+pub struct Utf16Char {
+    pub repr: [u16; 2],
+    pub len: usize,
+}
+
+impl Utf16Char {
+    const fn encode(ch: u32) -> Self {
+        if ch <= 0xffff {
+            Self {
+                repr: [ch as u16, 0],
+                len: 1,
+            }
+        } else {
+            let payload = ch - 0x10000;
+            let hi = (payload >> 10) | 0xd800;
+            let lo = (payload & 0x3ff) | 0xdc00;
+            Self {
+                repr: [hi as u16, lo as u16],
+                len: 2,
+            }
+        }
+    }
+
+    #[cfg(test)]
+    pub fn as_slice(&self) -> &[u16] {
+        &self.repr[..self.len]
+    }
+}
+
+pub struct EncodeUtf16Iter {
+    str: &'static [u8],
+    index: usize,
+}
+
+impl EncodeUtf16Iter {
+    pub const fn new(str: &'static [u8]) -> Self {
+        Self { str, index: 0 }
+    }
+
+    pub const fn next(self) -> Option<(Self, Utf16Char)> {
+        if self.index >= self.str.len() {
+            None
+        } else {
+            let (index, ch) = decode_utf8(self.str, self.index);
+            Some((Self { index, ..self }, Utf16Char::encode(ch)))
+        }
+    }
+}
+
+// (&str bytes, index) -> (new index, decoded char)
+const fn decode_utf8(s: &[u8], i: usize) -> (usize, u32) {
+    let b0 = s[i];
+    match b0 {
+        // one-byte seq
+        0b0000_0000..=0b0111_1111 => {
+            let decoded = b0 as u32;
+            (i + 1, decoded)
+        }
+        // two-byte seq
+        0b1100_0000..=0b1101_1111 => {
+            let decoded = ((b0 as u32 & 0x1f) << 6) | (s[i + 1] as u32 & 0x3f);
+            (i + 2, decoded)
+        }
+        // 3 byte seq
+        0b1110_0000..=0b1110_1111 => {
+            let decoded = ((b0 as u32 & 0x0f) << 12)
+                | ((s[i + 1] as u32 & 0x3f) << 6)
+                | (s[i + 2] as u32 & 0x3f);
+            (i + 3, decoded)
+        }
+        // 4 byte seq
+        0b1111_0000..=0b1111_0111 => {
+            let decoded = ((b0 as u32 & 0x07) << 18)
+                | ((s[i + 1] as u32 & 0x3f) << 12)
+                | ((s[i + 2] as u32 & 0x3f) << 6)
+                | (s[i + 3] as u32 & 0x3f);
+            (i + 4, decoded)
+        }
+        // continuation bytes, or never-valid bytes.
+        0b1000_0000..=0b1011_1111 | 0b1111_1000..=0b1111_1111 => {
+            #[allow(unconditional_panic)]
+            {
+                // replace this with unreachable!() when possible in const fn.
+                const NOT_POSSIBLE_FOR_VALID_UTF8: [(); 0] = [];
+                let _ = NOT_POSSIBLE_FOR_VALID_UTF8[0];
+            }
+            (s.len(), 0xfffd)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn decode_utf8() {
+        for c in '\u{0}'..=core::char::MAX {
+            let mut buf;
+            for off in 0..4 {
+                // Ensure we see garbage if we read outside bounds.
+                buf = [0xff; 8];
+                let len = c.encode_utf8(&mut buf[off..(off + 4)]).len();
+                let (end_idx, decoded) = super::decode_utf8(&buf, off);
+                assert_eq!(
+                    (end_idx, decoded),
+                    (off + len, c as u32),
+                    "failed for U+{code:04X} ({ch:?}) encoded as {buf:#x?} over {range:?}",
+                    code = c as u32,
+                    ch = c,
+                    buf = &buf[off..(off + len)],
+                    range = off..(off + len),
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn encode_utf16() {
+        for c in '\u{0}'..=core::char::MAX {
+            assert_eq!(
+                c.encode_utf16(&mut [0u16; 2]),
+                Utf16Char::encode(c as u32).as_slice(),
+                "failed for U+{:04X} ({:?})",
+                c as u32,
+                c
+            );
+        }
+    }
+}
diff --git a/src/_priv/mod.rs b/src/_priv/mod.rs
@@ -0,0 +1,4 @@
+pub use std;
+pub use std::ffi::c_void;
+
+pub mod cfstring;