Skip to content

Commit

Permalink
Transcode non-ASCII input to UTF-16 in nsstring!
Browse files Browse the repository at this point in the history
Closes #3.

For safety, the UTF-16 data has a trailing null byte that is not counted
as part of the string's length.

This also allows for interior null bytes, since strings that contain
nulls are encoded as UTF-16.

Co-authored-by: Nikolai Vazquez <hello@nikolaivazquez.com>
Co-authored-by: Thom Chiovoloni <chiovolonit@gmail.com>
Co-authored-by: rodrimati1992 <rodrimatt1985@gmail.com>
  • Loading branch information
3 people committed Sep 15, 2020
1 parent 10b763b commit 868fd65
Show file tree
Hide file tree
Showing 6 changed files with 367 additions and 137 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Expand Up @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog] and this project adheres to
- The `nsstring!` macro can now take `const X: &str` as input, not just string
literals.

- The `nsstring!` macro now allows interior null bytes, transcoding the string
to UTF-16.

- The `nsstring!` macro now allows trailing null bytes and uses the constant
as-is.

Expand All @@ -35,6 +38,16 @@ The format is based on [Keep a Changelog] and this project adheres to
These are aliased in `foundation` as: `NSPoint`, `NSSize`, `NSRect`, and
`NSRectEdge`.

### Fixed

- The `nsstring!` macro now transcodes non-ASCII strings to UTF-16, instead of
allowing UTF-8 data where only ASCII data is expected.

Transcoding was implemented by [@thomcc]. The iterator technique was provided by
[@rodrimati1992].

See issue [#3].

### Changed

- **\[breaking\]** Increased crate `#[cfg]` strictness from any 32/64 bit to
Expand Down Expand Up @@ -72,6 +85,9 @@ Initial release.
[Keep a Changelog]: http://keepachangelog.com/en/1.0.0/
[Semantic Versioning]: http://semver.org/spec/v2.0.0.html

[@thomcc]: https://github.com/thomcc
[@rodrimati1992]: https://github.com/rodrimati1992

[#3]: https://github.com/nvzqz/fruity/issues/3
[#1]: https://github.com/nvzqz/fruity/issues/1

Expand Down
90 changes: 90 additions & 0 deletions src/_priv/cfstring/mod.rs
@@ -0,0 +1,90 @@
use std::ffi::c_void;

pub mod utf16;

// Values stored in the `flags` word of the constant string structs below.
//
// From `CFString.c`:
// > !!! Note: Constant CFStrings use the bit patterns:
// > C8 (11001000 = default allocator, not inline, not freed contents; 8-bit; has NULL byte; doesn't have length; is immutable)
// > D0 (11010000 = default allocator, not inline, not freed contents; Unicode; is immutable)
// > The bit usages should not be modified in a way that would effect these bit patterns.
//
// The low byte of each constant (`0xC8` / `0xD0`) is the bit pattern quoted
// above. The `0x07` byte directly above it is the `CFTypeID` of `CFStringRef`.

/// Flags for a constant string stored as 8-bit data (`C8` bit pattern).
const FLAGS_ASCII: usize = 0x07_C8;
/// Flags for a constant string stored as Unicode data (`D0` bit pattern).
const FLAGS_UTF16: usize = 0x07_D0;

/// Layout of a constant string object whose contents are 8-bit data.
///
/// The struct is `#[repr(C)]`: the order of the fields is part of the layout
/// and must stay `isa`, `flags`, `data`, `len`.
#[repr(C)]
pub struct CFStringAscii {
    /// The `isa` pointer handed to [`CFStringAscii::new`].
    isa: *const c_void,
    /// Always `FLAGS_ASCII`; set by [`CFStringAscii::new`].
    flags: usize,
    /// Pointer to the first byte of the string contents.
    data: *const u8,
    /// Length handed to [`CFStringAscii::new`].
    // NOTE(review): presumably a byte count that excludes any trailing NUL —
    // confirm against the macro that builds these values.
    len: usize,
}

// The raw-pointer fields make the type `!Sync` by default; this impl is
// required to place a value in a `static`.
unsafe impl Sync for CFStringAscii {}

impl CFStringAscii {
    /// Assembles the object from its `isa` pointer, data pointer and length.
    ///
    /// The `flags` word is not caller-controlled; it is always `FLAGS_ASCII`.
    pub const fn new(isa: *const c_void, data: *const u8, len: usize) -> Self {
        Self {
            isa,
            flags: FLAGS_ASCII,
            data,
            len,
        }
    }

    /// Returns the address of this object as an untyped pointer.
    pub const fn as_ptr(&self) -> *const c_void {
        let this: *const Self = self;
        this as *const c_void
    }
}

/// Layout of a constant string object whose contents are 16-bit code units.
///
/// The struct is `#[repr(C)]`: the order of the fields is part of the layout
/// and must stay `isa`, `flags`, `data`, `len`.
#[repr(C)]
pub struct CFStringUtf16 {
    /// The `isa` pointer handed to [`CFStringUtf16::new`].
    isa: *const c_void,
    /// Always `FLAGS_UTF16`; set by [`CFStringUtf16::new`].
    flags: usize,
    /// Pointer to the first 16-bit unit of the string contents.
    data: *const u16,
    /// Length handed to [`CFStringUtf16::new`].
    // NOTE(review): presumably a count of `u16` units that excludes the
    // trailing null — confirm against the macro that builds these values.
    len: usize,
}

// The raw-pointer fields make the type `!Sync` by default; this impl is
// required to place a value in a `static`.
unsafe impl Sync for CFStringUtf16 {}

impl CFStringUtf16 {
    /// Assembles the object from its `isa` pointer, data pointer and length.
    ///
    /// The `flags` word is not caller-controlled; it is always `FLAGS_UTF16`.
    pub const fn new(isa: *const c_void, data: *const u16, len: usize) -> Self {
        Self {
            isa,
            flags: FLAGS_UTF16,
            data,
            len,
        }
    }

    /// Returns the address of this object as an untyped pointer.
    pub const fn as_ptr(&self) -> *const c_void {
        let this: *const Self = self;
        this as *const c_void
    }
}

/// Returns the bytes of `s`, dropping a single trailing 0 byte if present.
///
/// At most one byte is removed: `"a\0\0"` yields `b"a\0"`. Strings that do not
/// end in a 0 byte are returned unchanged.
pub const fn trim_trailing_nul(s: &str) -> &[u8] {
    let bytes = s.as_bytes();
    if let [init @ .., 0] = bytes {
        init
    } else {
        bytes
    }
}

/// Returns `true` if every byte of `bytes` is ASCII and none of them is 0.
///
/// An empty slice yields `true`. A 0 byte anywhere in the slice, including
/// the last position, yields `false`.
pub const fn is_ascii(bytes: &[u8]) -> bool {
    let mut remaining = bytes;
    // Peel bytes off the front until one fails the check or none are left.
    while let [first, rest @ ..] = remaining {
        if *first == 0 || !first.is_ascii() {
            return false;
        }
        remaining = rest;
    }
    true
}
130 changes: 130 additions & 0 deletions src/_priv/cfstring/utf16.rs
@@ -0,0 +1,130 @@
/// A single code point encoded as UTF-16.
pub struct Utf16Char {
    /// The encoded units. Only the first `len` entries are meaningful; an
    /// unused second entry is 0.
    pub repr: [u16; 2],
    /// Number of units in `repr` that are in use: 1 or 2.
    pub len: usize,
}

impl Utf16Char {
    /// Encodes the code point `ch` as one unit, or as a surrogate pair when
    /// it is above `0xffff`.
    ///
    /// `ch` is not validated; callers are expected to pass a Unicode scalar
    /// value.
    const fn encode(ch: u32) -> Self {
        if ch > 0xffff {
            // Supplementary plane: split the 20-bit offset into two 10-bit
            // halves and tag them as high and low surrogates.
            let offset = ch - 0x10000;
            let high = 0xd800 | (offset >> 10);
            let low = 0xdc00 | (offset & 0x3ff);
            return Self {
                len: 2,
                repr: [high as u16, low as u16],
            };
        }

        Self {
            len: 1,
            repr: [ch as u16, 0],
        }
    }

    /// Returns the units that are in use.
    #[cfg(test)]
    pub fn as_slice(&self) -> &[u16] {
        let units: &[u16] = &self.repr;
        &units[..self.len]
    }
}

/// Iterator-like cursor, usable in `const` code, that walks UTF-8 bytes and
/// yields each code point encoded as UTF-16.
///
/// It cannot implement `Iterator`; instead `next` consumes the cursor and
/// hands back the advanced one together with the item.
pub struct EncodeUtf16Iter {
    /// UTF-8 bytes being transcoded.
    str: &'static [u8],
    /// Byte offset of the next code point to decode.
    index: usize,
}

impl EncodeUtf16Iter {
    /// Creates a cursor positioned at the start of `str`.
    pub const fn new(str: &'static [u8]) -> Self {
        Self { index: 0, str }
    }

    /// Decodes the code point at the current offset.
    ///
    /// Returns `None` once the offset has reached the end of the input;
    /// otherwise returns the cursor advanced past the decoded sequence and
    /// the UTF-16 encoding of the decoded code point.
    pub const fn next(self) -> Option<(Self, Utf16Char)> {
        if self.index < self.str.len() {
            let (next_index, scalar) = decode_utf8(self.str, self.index);
            let advanced = Self {
                str: self.str,
                index: next_index,
            };
            Some((advanced, Utf16Char::encode(scalar)))
        } else {
            None
        }
    }
}

/// Decodes the UTF-8 sequence that starts at byte index `i` of `s`.
///
/// Returns `(next, scalar)`: the index just past the decoded sequence and the
/// decoded value. The sequence length is chosen from the lead byte alone and
/// continuation bytes are masked rather than validated, so `s` is expected to
/// be valid UTF-8 (the bytes of a `&str`) with `i` on a character boundary.
const fn decode_utf8(s: &[u8], i: usize) -> (usize, u32) {
    // Low six payload bits of a continuation byte.
    const fn tail(byte: u8) -> u32 {
        byte as u32 & 0x3f
    }

    let lead = s[i];
    let bits = lead as u32;

    if lead <= 0x7f {
        // One-byte sequence: 0xxxxxxx.
        (i + 1, bits)
    } else if lead >= 0xc0 && lead <= 0xdf {
        // Two-byte sequence: 110xxxxx 10xxxxxx.
        let scalar = ((bits & 0x1f) << 6) | tail(s[i + 1]);
        (i + 2, scalar)
    } else if lead >= 0xe0 && lead <= 0xef {
        // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
        let scalar = ((bits & 0x0f) << 12) | (tail(s[i + 1]) << 6) | tail(s[i + 2]);
        (i + 3, scalar)
    } else if lead >= 0xf0 && lead <= 0xf7 {
        // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        let scalar = ((bits & 0x07) << 18)
            | (tail(s[i + 1]) << 12)
            | (tail(s[i + 2]) << 6)
            | tail(s[i + 3]);
        (i + 4, scalar)
    } else {
        // Lead byte is a continuation byte (0x80..=0xbf) or a byte that never
        // occurs in UTF-8 (0xf8..=0xff).
        #[allow(unconditional_panic)]
        {
            // Out-of-bounds index used to force a panic; replace this with
            // unreachable!() when possible in const fn.
            const NOT_POSSIBLE_FOR_VALID_UTF8: [(); 0] = [];
            let _ = NOT_POSSIBLE_FOR_VALID_UTF8[0];
        }
        (s.len(), 0xfffd)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes every `char` with the standard library at several offsets in a
    /// padded buffer and checks that `decode_utf8` recovers both the scalar
    /// value and the end index.
    #[test]
    fn decode_utf8() {
        for scalar in '\u{0}'..=core::char::MAX {
            for start in 0..4 {
                // Fresh 0xff padding each round, so a read outside the
                // encoded bytes yields garbage instead of a plausible value.
                let mut bytes = [0xff_u8; 8];
                let written = scalar.encode_utf8(&mut bytes[start..(start + 4)]).len();
                let end = start + written;

                let (end_idx, decoded) = super::decode_utf8(&bytes, start);
                assert_eq!(
                    (end_idx, decoded),
                    (end, scalar as u32),
                    "failed for U+{code:04X} ({ch:?}) encoded as {buf:#x?} over {range:?}",
                    code = scalar as u32,
                    ch = scalar,
                    buf = &bytes[start..end],
                    range = start..end,
                );
            }
        }
    }

    /// Checks `Utf16Char::encode` against `char::encode_utf16` for every
    /// `char`.
    #[test]
    fn encode_utf16() {
        for scalar in '\u{0}'..=core::char::MAX {
            let mut units = [0u16; 2];
            let expected = scalar.encode_utf16(&mut units);
            let actual = Utf16Char::encode(scalar as u32);
            assert_eq!(
                expected,
                actual.as_slice(),
                "failed for U+{:04X} ({:?})",
                scalar as u32,
                scalar
            );
        }
    }
}
4 changes: 4 additions & 0 deletions src/_priv/mod.rs
@@ -0,0 +1,4 @@
pub use std;
pub use std::ffi::c_void;

pub mod cfstring;

0 comments on commit 868fd65

Please sign in to comment.