Skip to content

Commit

Permalink
mem: add split_rev
Browse files Browse the repository at this point in the history
Over the last couple of weeks I needed to iterate over a
collection backwards at least twice. Do we want to have this in stdlib?
If yes, click "Merge" and start using today! Free shipping and returns
(before 1.0).

Why is this useful?
-------------------

I need this for building an error wrapper: errors are added in the
wrapper from "lowest" level to "highest" level, and then printed in
reverse order. Imagine `UpdateUsers` call, which needs to return
`error.InvalidInput` and a wrappable error context. In Go we would add a
context to the error when returning it:

    // if update_user fails, add context on which user we are operating
    if err := update_user(user); err != nil {
        return fmt.Errorf("user id=%d: %w", user.id, err)
    }

Since a Zig error cannot carry anything other than its u16 code (ziglang#2647), I
will pass an `err_ctx: *Err` to the callees, where they can, besides
returning an error, augment it with auxiliary data. `Err` is a
preallocated array that can add zero-byte-separated strings. For a
concrete example, imagine such a call graph:

    update_user(User, *Err) error{InvalidInput}!<...>
      validate_user([]const u8, *Err) error{InvalidInput}!<...>

Here `validate_user` would like to signal the invalid field in addition to
the error, and `update_user` would like to signal the offending user id in
addition to the error.

We also don't want the low-level functions to know in which context they
are operating to construct a meaningful error message: if validation
fails, they append their "context" to the buffer. To translate/augment
the Go example above:

    pub fn validate_user(err_ctx: *Err, user: User) error{InvalidInput}!void {
        if (!ascii.isAlpha(user.name)) {
            err_ctx.print("name '{s}' must be ascii-letters only", .{user.name});
            return error.InvalidInput;
        }
        <...>
    }

    // update_user validates each user and does something with it.
    pub fn update_user(err_ctx: *Err, user: User) error{InvalidInput}!void {
        // validate the user before updating it
        validate_user(err_ctx, user) catch {
            err_ctx.print("user id={d}", .{user.id});
            return error.InvalidInput;
        };
        <...>
    }

Then the top-level function (in my case, CLI) will read the buffer
backwards (splitting on `"\x00"`) and print:

    user id=123: name 'žemas' must be ascii-letters only

To read that buffer backwards, dear readers of this commit message, I
need `mem.split_rev`.
  • Loading branch information
motiejus committed Jun 22, 2022
1 parent e44f927 commit b11ba83
Showing 1 changed file with 123 additions and 6 deletions.
129 changes: 123 additions & 6 deletions lib/std/mem.zig
Expand Up @@ -1690,7 +1690,6 @@ test "tokenize (reset)" {
/// If `delimiter` does not exist in buffer,
/// the iterator will return `buffer`, null, in that order.
/// The delimiter length must not be zero.
/// See also the related function `tokenize`.
pub fn split(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T) {
assert(delimiter.len != 0);
return .{
Expand Down Expand Up @@ -1754,11 +1753,100 @@ test "split (multibyte)" {
std.unicode.utf8ToUtf16LeStringLiteral("a, b ,, c, d, e"),
std.unicode.utf8ToUtf16LeStringLiteral(", "),
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b ,")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("e")));
try testing.expectEqualSlices(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a"));
try testing.expectEqualSlices(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b ,"));
try testing.expectEqualSlices(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c"));
try testing.expectEqualSlices(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d"));
try testing.expectEqualSlices(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("e"));
try testing.expect(it16.next() == null);
}

/// Returns an iterator over the slices of `buffer` that are separated by
/// the sequence `delimiter`, visiting them from the end of `buffer`
/// towards its start.
/// `split_rev(u8, "abc|def||ghi", "|")`
/// will return slices for "ghi", "", "def", "abc", null, in that order.
/// If `delimiter` does not occur in `buffer`,
/// the iterator will return `buffer`, null, in that order.
/// The delimiter length must not be zero (this is asserted).
/// See also the related functions `split` and `tokenize`.
pub fn split_rev(comptime T: type, buffer: []const T, delimiter: []const T) SplitIteratorRev(T) {
    assert(delimiter.len != 0);
    // Nothing has been consumed yet, so the cursor starts at the end of the buffer.
    return .{
        .buffer = buffer,
        .index = buffer.len,
        .delimiter = delimiter,
    };
}

test "split_rev" {
    // `expectEqualSlices` takes (T, expected, actual); the known-good literal
    // goes first so that a failure report labels the two sides correctly.
    var it = split_rev(u8, "abc|def||ghi", "|");
    try testing.expectEqualSlices(u8, "abc|def||ghi", it.rest());
    try testing.expectEqualSlices(u8, "ghi", it.next().?);

    // Two adjacent delimiters yield an empty field between them.
    try testing.expectEqualSlices(u8, "abc|def|", it.rest());
    try testing.expectEqualSlices(u8, "", it.next().?);

    try testing.expectEqualSlices(u8, "abc|def", it.rest());
    try testing.expectEqualSlices(u8, "def", it.next().?);

    try testing.expectEqualSlices(u8, "abc", it.rest());
    try testing.expectEqualSlices(u8, "abc", it.next().?);

    // Once the first field has been returned nothing is left.
    try testing.expectEqualSlices(u8, "", it.rest());
    try testing.expect(it.next() == null);

    // An empty buffer still produces exactly one (empty) field.
    it = split_rev(u8, "", "|");
    try testing.expectEqualSlices(u8, "", it.next().?);
    try testing.expect(it.next() == null);

    // A buffer consisting only of the delimiter has an empty field on each side.
    it = split_rev(u8, "|", "|");
    try testing.expectEqualSlices(u8, "", it.next().?);
    try testing.expectEqualSlices(u8, "", it.next().?);
    try testing.expect(it.next() == null);

    // Delimiter absent: the whole buffer is the single field.
    it = split_rev(u8, "hello", " ");
    try testing.expectEqualSlices(u8, "hello", it.next().?);
    try testing.expect(it.next() == null);

    // Same "delimiter absent" case with a non-u8 element type.
    var it16 = split_rev(
        u16,
        std.unicode.utf8ToUtf16LeStringLiteral("hello"),
        std.unicode.utf8ToUtf16LeStringLiteral(" "),
    );
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("hello"), it16.next().?);
    try testing.expect(it16.next() == null);
}

test "split_rev (multibyte)" {
    // `expectEqualSlices` takes (T, expected, actual); the known-good literal
    // goes first so that a failure report labels the two sides correctly.
    // The delimiter here is the two-element sequence ", ", so a lone "," or a
    // "," followed by another "," stays part of the field (see "b ,").
    var it = split_rev(u8, "a, b ,, c, d, e", ", ");
    try testing.expectEqualSlices(u8, "a, b ,, c, d, e", it.rest());
    try testing.expectEqualSlices(u8, "e", it.next().?);

    try testing.expectEqualSlices(u8, "a, b ,, c, d", it.rest());
    try testing.expectEqualSlices(u8, "d", it.next().?);

    try testing.expectEqualSlices(u8, "a, b ,, c", it.rest());
    try testing.expectEqualSlices(u8, "c", it.next().?);

    try testing.expectEqualSlices(u8, "a, b ,", it.rest());
    try testing.expectEqualSlices(u8, "b ,", it.next().?);

    try testing.expectEqualSlices(u8, "a", it.rest());
    try testing.expectEqualSlices(u8, "a", it.next().?);

    try testing.expectEqualSlices(u8, "", it.rest());
    try testing.expect(it.next() == null);

    // Same input and delimiter with a non-u8 element type.
    var it16 = split_rev(
        u16,
        std.unicode.utf8ToUtf16LeStringLiteral("a, b ,, c, d, e"),
        std.unicode.utf8ToUtf16LeStringLiteral(", "),
    );
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("e"), it16.next().?);
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("d"), it16.next().?);
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("c"), it16.next().?);
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("b ,"), it16.next().?);
    try testing.expectEqualSlices(u16, std.unicode.utf8ToUtf16LeStringLiteral("a"), it16.next().?);
    try testing.expect(it16.next() == null);
}

Expand Down Expand Up @@ -1858,6 +1946,35 @@ pub fn SplitIterator(comptime T: type) type {
};
}

/// Iterator type returned by `split_rev`: yields the fields of `buffer`
/// separated by `delimiter`, starting with the last field and ending with
/// the first one.
pub fn SplitIteratorRev(comptime T: type) type {
    return struct {
        /// The full input being split; never modified by the iterator.
        buffer: []const T,
        /// End (exclusive) of the part of `buffer` not yet returned,
        /// or null once the first field has been handed out.
        index: ?usize,
        /// The element sequence that separates fields.
        delimiter: []const T,

        const Self = @This();

        /// Returns a slice of the next field (walking backwards), or null
        /// if splitting is complete.
        pub fn next(self: *Self) ?[]const T {
            const end = self.index orelse return null;
            const unread = self.buffer[0..end];

            if (lastIndexOf(T, unread, self.delimiter)) |delim_pos| {
                // The field is everything after the last delimiter; what
                // precedes that delimiter remains for subsequent calls.
                self.index = delim_pos;
                return unread[delim_pos + self.delimiter.len ..];
            }

            // No delimiter left: the remainder is the final field.
            self.index = null;
            return unread;
        }

        /// Returns a slice of the remaining, not yet returned elements.
        /// Does not affect iterator state.
        pub fn rest(self: Self) []const T {
            if (self.index) |end| return self.buffer[0..end];
            return self.buffer[0..0];
        }
    };
}

/// Naively combines a series of slices with a separator.
/// Allocates memory for the result, which must be freed by the caller.
pub fn join(allocator: Allocator, separator: []const u8, slices: []const []const u8) ![]u8 {
Expand Down

0 comments on commit b11ba83

Please sign in to comment.