Skip to content

Commit

Permalink
Add UTF-8 compatibility library
Browse files Browse the repository at this point in the history
  • Loading branch information
aidanholm committed Oct 1, 2017
1 parent f2124f5 commit 1fcb990
Show file tree
Hide file tree
Showing 9 changed files with 256 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -24,6 +24,7 @@
- Added `webview` widget `allow_file_access_from_file_urls` and `allow_universal_access_from_file_urls` properties.
- Added `settings` module and APIs. This replaces the `domain_props` module.
- Added `tablist.always_visible` setting.
- Added `utf8.len` (same as `string.wlen`) and `utf8.offset` methods.

### Changed

Expand Down
132 changes: 132 additions & 0 deletions common/clib/utf8.c
@@ -0,0 +1,132 @@
/*
* common/clib/utf8.c - Basic UTF8 character counting (wrapper for glib)
*
* Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/

#include "common/clib/utf8.h"
#include "luah.h"

#include <glib.h>

/* Translate a Lua-style byte position into a 0-based byte offset.
 * Positive positions are 1-based from the start of the string; negative
 * positions count back from its end (-1 being the last byte). The offset
 * equal to length (one past the last byte) is accepted.
 * Returns (size_t) -1 if the position is 0 or falls outside of that range. */
static size_t
abspos(ssize_t offset, size_t length)
{
    const size_t out_of_range = (size_t) -1;
    ssize_t zero_based;

    if (offset > 0)
        zero_based = offset - 1;
    else if (offset < 0)
        zero_based = offset + (ssize_t) length;
    else
        return out_of_range;

    if (zero_based >= 0 && (size_t) zero_based <= length)
        return (size_t) zero_based;
    return out_of_range;
}

/* UTF8 aware string length computing.
 *
 * Lua arguments: the string, an optional initial byte position (default 1)
 * and an optional final byte position (default: the byte length of the
 * string). Characters are counted if they start inside that byte range.
 *
 * Pushes the number of characters, or nil followed by a 1-based byte
 * position if g_utf8_validate() rejects the selected range. Raises a Lua
 * error if either position is out of the string.
 * Returns the number of elements pushed on the stack. */
static gint
luaH_utf8_len(lua_State *L)
{
    size_t nbytes;
    const gchar *s = luaL_checklstring(L, 1, &nbytes);

    /* 0-based offset of the first byte to look at */
    size_t first = abspos(luaL_optinteger(L, 2, 1), nbytes);
    luaL_argcheck(L, first != (size_t) -1, 2, "initial position out of string");

    /* 0-based offset of the last byte to look at; handled by hand instead of
     * through abspos() to imitate Lua 5.3 (the result may be negative) */
    ssize_t last = luaL_optinteger(L, 3, nbytes);
    if (last >= 0)
        last -= 1;
    else
        last += (ssize_t) nbytes;
    luaL_argcheck(L, last < (ssize_t) nbytes, 3, "final position out of string");

    /* The range ends where the character following `last` starts, so that a
     * character starting inside the range is included as a whole. An empty
     * range is used if `last` lies before `first`. */
    size_t stop = first;
    if (last >= (ssize_t) first && (size_t) last < nbytes) {
        const gchar *next = g_utf8_find_next_char(s + (size_t) last, NULL);
        stop = next - s;
    }

    /* only count characters if the range is valid UTF8 */
    const gchar *bad = NULL;
    if (g_utf8_validate(s + first, stop - first, &bad)) {
        lua_pushinteger(L, (ssize_t) g_utf8_strlen(s + first, stop - first));
        return 1;
    }

    /* otherwise report nil and the (1-based) position validation stopped at */
    lua_pushnil(L);
    lua_pushinteger(L, (ssize_t) (bad - s) + 1);
    return 2;
}

/* UTF8 aware string offset conversion.
 * Converts (1-based) UTF8 offset to (1-based) byte offset.
 *
 * Lua arguments: the string, the character offset, and an optional base
 * byte position. The base defaults to 1 if the character offset is
 * non-negative and to one past the byte length of the string otherwise.
 * A negative character offset counts characters backwards from the base.
 *
 * Pushes the byte offset, or nil if the requested character lies outside of
 * the string. Raises a Lua error if the base position is out of range or if
 * g_utf8_get_char_validated() reports an invalid sequence there (e.g. a
 * continuation byte).
 * Returns the number of elements pushed on the stack. */
static gint
luaH_utf8_offset(lua_State *L)
{
    size_t blen;
    const gchar *str = luaL_checklstring(L, 1, &blen);
    ssize_t widx = luaL_checkinteger(L, 2);
    if (widx > 0)
        widx--; /* adjust to 0-based */

    /* parse optional parameter (base index)
     * raise an error if out of bounds
     * or if initial position points inside a UTF8 encoding.
     * The raw Lua value is kept in a signed variable because it may
     * legitimately be negative (counting from the end of the string). */
    ssize_t sbase = luaL_optinteger(L, 3,
            (widx >= 0) ? 1 : (ssize_t) blen + 1);
    size_t bbase = abspos(sbase, blen);
    luaL_argcheck(L, bbase != (size_t) -1, 3, "position out of range");
    if (g_utf8_get_char_validated(str + bbase, -1) == (gunichar) -1)
        luaL_error(L, "initial position is a continuation byte");

    /* Determine the segment to walk and the number of characters in it.
     * A negative index is converted to a non-negative one relative to the
     * start of the string; all of this is done in signed arithmetic so that
     * an index reaching before the string simply stays negative. */
    const gchar *start;
    ssize_t wseglen;
    if (widx < 0) {
        start = str;
        wseglen = (ssize_t) g_utf8_strlen(str, (gssize) bbase);
        widx += wseglen;
    } else {
        start = str + bbase;
        wseglen = (ssize_t) g_utf8_strlen(start, (gssize) (blen - bbase));
    }

    /* convert UTF8 offset to a pointer, unless it lies outside the segment
     * (an index equal to the segment length, i.e. one past its last
     * character, is accepted) */
    const gchar *pos = NULL;
    if (widx >= 0 && widx <= wseglen)
        pos = g_utf8_offset_to_pointer(start, (glong) widx);

    /* if conversion was successful, output result (else output nil) */
    if (pos != NULL)
        lua_pushinteger(L, (ssize_t) (pos - str) + 1);
    else
        lua_pushnil(L);
    return 1;
}

/* Make the `utf8` library, consisting of the functions `len` and `offset`,
 * available in the given Lua state. */
void
utf8_lib_setup(lua_State *L)
{
    /* name -> C function mapping, terminated by a NULL entry */
    static const struct luaL_Reg utf8_functions[] = {
        { "len",    luaH_utf8_len    },
        { "offset", luaH_utf8_offset },
        { NULL,     NULL             }
    };

    /* the same table is passed for both function list arguments */
    luaH_openlib(L, "utf8", utf8_functions, utf8_functions);
}

// vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
30 changes: 30 additions & 0 deletions common/clib/utf8.h
@@ -0,0 +1,30 @@
/*
* common/clib/utf8.h - UTF8 class header
*
* Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/

#ifndef LUAKIT_COMMON_CLIB_UTF8_H
#define LUAKIT_COMMON_CLIB_UTF8_H

#include <lua.h>

/* Set up the `utf8` Lua library (implemented in common/clib/utf8.c)
 * in the given Lua state. */
void utf8_lib_setup(lua_State *);

#endif

// vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
40 changes: 40 additions & 0 deletions doc/luadoc/utf8.lua
@@ -0,0 +1,40 @@
--- Basic UTF8 character counting support for Luakit.
--
-- This module provides a partial implementation of the Lua 5.3 UTF-8 library.
--
-- @module utf8
-- @author Dennis Hofheinz
-- @copyright 2017 Dennis Hofheinz <github@kjdf.de>

--- @function len
-- Return the number of characters (not bytes) of a UTF-8-encoded string.
--
-- If the optional parameters `begin` and/or `end` are given, then characters within `s` will only be counted if they begin between positions `begin` and `end` (both inclusive).
--
-- If `s` (or the characters that start in the slice from `begin` to `end`) contains invalid UTF8 characters, `nil` and the (1-based byte) position of the first invalid byte are returned instead of the length. An error is raised if `begin` or `end` point to byte indices not in `s`.
--
-- @tparam string s The string whose length is to be returned.
-- @tparam[opt] integer begin Only consider `s` from (1-based byte) index `begin` onwards. If negative, count from the end of `s` (with -1 being the last byte).
-- @default 1
-- @tparam[opt] integer end Only consider `s` up to and including (1-based byte) index `end`. If negative, count from the end of `s` (with -1 being the last byte).
-- @default -1
-- @treturn integer The length (in UTF8 characters) of `s`.

--- @function offset
-- Convert an offset (in UTF8 characters) to a byte offset.
--
-- If optional parameter `base` is given and positive, count characters starting from (byte) index `base`.
--
-- An error is raised if `base` is `0` or lies outside of `string` (a negative `base` counts from the end of `string`, with -1 being the last byte, and `base` may point one past the last byte), or if `base` points to a byte inside `string` that is not the starting byte of a UTF8 encoding.
--
-- # Examples
--
-- - `utf8.offset("abc",2,2)` would return `3`
-- - `utf8.offset("abc",-3)` would return `1`
--
-- @tparam string string The string in which offsets should be converted.
-- @tparam integer woffset The offset (1-based, in UTF8 characters) which should be converted.
-- @tparam[opt] integer base A (1-based byte) index in `string`. Defaults to 1 if `woffset` is non-negative, and to one past the (byte) length of `string` if `woffset` is negative. See the description above.
-- @treturn integer The (1-based) byte offset of the `woffset`-th UTF8 character in `string`.

-- vim: et:sw=4:ts=8:sts=4:tw=80
2 changes: 2 additions & 0 deletions extension/extension.c
Expand Up @@ -37,6 +37,7 @@
#include "common/clib/ipc.h"
#include "common/clib/timer.h"
#include "common/clib/regex.h"
#include "common/clib/utf8.h"

#include "extension/scroll.h"
#include "extension/luajs.h"
Expand Down Expand Up @@ -69,6 +70,7 @@ web_lua_init(const char *package_path, const char *package_cpath)
ipc_channel_class_setup(L);
timer_class_setup(L);
regex_class_setup(L);
utf8_lib_setup(L);
dom_document_class_setup(L);
dom_element_class_setup(L);
page_class_setup(L);
Expand Down
1 change: 1 addition & 0 deletions lib/help_chrome.lua
Expand Up @@ -90,6 +90,7 @@ local builtin_module_set = {
luakit = true,
msg = true,
soup = true,
utf8 = true,
}

local help_doc_index_page_preprocess = function (inner, style)
Expand Down
4 changes: 4 additions & 0 deletions luah.c
Expand Up @@ -41,6 +41,7 @@
#include "common/clib/ipc.h"
#include "common/clib/timer.h"
#include "common/clib/regex.h"
#include "common/clib/utf8.h"
#include "globalconf.h"

#include <glib.h>
Expand Down Expand Up @@ -134,6 +135,9 @@ luaH_init(gchar ** uris)
/* Export regex */
regex_class_setup(L);

/* Export utf8 */
utf8_lib_setup(L);

/* Export request */
request_class_setup(L);

Expand Down
45 changes: 45 additions & 0 deletions tests/async/test_clib_utf8.lua
@@ -0,0 +1,45 @@
--- Test utf8 clib functionality.
--
-- @copyright 2017 Dennis Hofheinz <github@kjdf.de>

local assert = require "luassert"

-- Table collecting the test functions of this file; returned at the end.
local T = {}

--- The `utf8` library must be available as a global table.
function T.test_module()
    assert.is_table(utf8)
end

--- Check `utf8.len` on whole strings, on byte ranges and on bad positions.
function T.test_utf8_len()
    local len = utf8.len

    -- whole strings
    assert.equal(0, len(""))
    assert.equal(1, len("ä"))
    assert.equal(2, len("äa"))

    -- only an initial position, counted from the end of the string
    assert.equal(1, len("äa", -1))
    assert.equal(2, len("äa", -3))

    -- initial and final position
    assert.equal(1, len("äa", 1, 1))
    assert.equal(1, len("äa", 1, 2))
    assert.equal(2, len("äa", 1, 3))

    -- corner cases: empty ranges
    assert.equal(0, len("", 1, 0))
    assert.equal(0, len("äa", 4))
    assert.equal(0, len("äa", 3, 2))

    -- errors: positions out of the string
    assert.has.errors(function() len("", 1, 1) end)
    assert.has.errors(function() len("äa", 0) end)
    assert.has.errors(function() len("äa", 5) end)
end

--- Check `utf8.offset` with and without a base position.
function T.test_utf8_offset()
    local offset = utf8.offset
    local sample = "äaäaä"

    -- default base position
    assert.equal(1, offset(sample, 1))
    assert.equal(3, offset(sample, 2))
    assert.equal(7, offset(sample, 5))
    assert.equal(9, offset(sample, 6))

    -- explicit base position, counted from the start or from the end
    assert.equal(4, offset(sample, 2, 3))
    assert.equal(7, offset(sample, 2, -3))

    -- corner cases and errors
    assert.has.errors(function() offset(sample, 1, 2) end)
    assert.has.errors(function() offset(sample, 1, 5) end)
end

return T

-- vim: et:sw=4:ts=8:sts=4:tw=80
1 change: 1 addition & 0 deletions tests/style/test_luacheck.lua
Expand Up @@ -19,6 +19,7 @@ function T.test_luacheck ()
"ipc_channel",
"string.wlen",
"regex",
"utf8",
}
local ui_globals = {
"sqlite3",
Expand Down

0 comments on commit 1fcb990

Please sign in to comment.