Add UTF-8 compatibility library

luakit · Oct 1, 2017 · 1fcb990 · 1fcb990
1 parent f2124f5
commit 1fcb990
Show file tree

Hide file tree

Showing 9 changed files with 256 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@
  - Added `webview` widget `allow_file_access_from_file_urls` and `allow_universal_access_from_file_urls` properties.
  - Added `settings` module and APIs. This replaces the `domain_props` module.
  - Added `tablist.always_visible` setting.
+ - Added `utf8.len` (same as `string.wlen`) and `utf8.offset` methods.
 
 ### Changed
 

diff --git a/common/clib/utf8.c b/common/clib/utf8.c
@@ -0,0 +1,132 @@
+/*
+ * common/clib/utf8.c - Basic UTF8 character counting (wrapper for glib)
+ *
+ * Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/clib/utf8.h"
+#include "luah.h"
+
+#include <glib.h>
+
+/* Translate a 1-based byte position into a 0-based byte offset.
+ * A negative position is taken relative to the end of the string
+ * (-1 names the last byte).
+ * The result is (size_t) -1 if the position is 0 or resolves to an offset
+ * outside of [0, length]; the offset `length` itself (one past the last
+ * byte) is accepted. */
+static size_t
+abspos(ssize_t offset, size_t length) {
+    const size_t out_of_range = (size_t) -1;
+    ssize_t zero_based;
+
+    if (offset > 0)
+        zero_based = offset - 1;
+    else if (offset < 0)
+        zero_based = offset + (ssize_t) length;
+    else
+        return out_of_range;
+
+    if (zero_based >= 0 && (size_t) zero_based <= length)
+        return (size_t) zero_based;
+    return out_of_range;
+}
+
+/* Count the UTF8 characters of a string (or of a byte slice of it).
+ *
+ * Lua arguments:
+ *   1. the string to measure
+ *   2. optional (1-based) byte position of the slice start, default 1
+ *   3. optional (1-based) byte position of the slice end (inclusive),
+ *      defaults to the byte length of the string
+ * Negative positions are taken relative to the end of the string.
+ *
+ * Pushes the character count, or nil followed by the (1-based) byte
+ * position where validation stopped if the slice is not valid UTF8.
+ * Raises a Lua argument error for out-of-range positions.
+ * Returns the number of elements pushed on the stack. */
+static gint
+luaH_utf8_len(lua_State *L)
+{
+    size_t nbytes;
+    const gchar *s = luaL_checklstring(L, 1, &nbytes);
+
+    /* 0-based offset of the first byte of the slice */
+    size_t first = abspos(luaL_optinteger(L, 2, 1), nbytes);
+    luaL_argcheck(L, first != (size_t) -1, 2, "initial position out of string");
+
+    /* 0-based offset of the last byte of the slice; this is resolved by
+     * hand (and may end up negative) to imitate Lua 5.3 */
+    ssize_t last = luaL_optinteger(L, 3, nbytes);
+    if (last >= 0)
+        last -= 1;
+    else
+        last += (ssize_t) nbytes;
+    luaL_argcheck(L, last < (ssize_t) nbytes, 3, "final position out of string");
+
+    /* the slice stops where the character following `last` begins, so a
+     * character that starts at or before `last` is included as a whole;
+     * if `last` lies in front of `first` the slice is empty */
+    size_t stop = first;
+    if (last >= (ssize_t) first && (size_t) last < nbytes) {
+        const gchar *next = g_utf8_find_next_char(s + (size_t) last, NULL);
+        stop = next - s;
+    }
+    const gchar *slice = s + first;
+    size_t span = stop - first;
+
+    /* valid slice: push its length in characters */
+    const gchar *bad = NULL;
+    if (g_utf8_validate(slice, span, &bad)) {
+        lua_pushinteger(L, (ssize_t) g_utf8_strlen(slice, span));
+        return 1;
+    }
+
+    /* invalid slice: push nil and the 1-based position of the bad byte */
+    lua_pushnil(L);
+    lua_pushinteger(L, (ssize_t) (bad - s) + 1);
+    return 2;
+}
+
+/* UTF8 aware string offset conversion.
+ * Converts (1-based) UTF8 offset to (1-based) byte offset.
+ *
+ * Lua arguments:
+ *   1. the string to inspect
+ *   2. the character offset; a negative value counts backwards from the
+ *      base position (an offset of 0 is handled like 1)
+ *   3. optional (1-based, possibly negative) byte position to count from;
+ *      defaults to 1 for non-negative offsets and to the byte length of
+ *      the string plus one for negative offsets
+ *
+ * Pushes the byte offset, or nil if the offset lies outside the string.
+ * Raises a Lua error if the base position is out of range or if the bytes
+ * at the base position are rejected by g_utf8_get_char_validated().
+ * Returns the number of elements pushed on the stack. */
+static gint
+luaH_utf8_offset(lua_State *L)
+{
+    size_t blen;
+    const gchar *str = luaL_checklstring(L, 1, &blen);
+    ssize_t widx = luaL_checkinteger(L, 2);
+    if (widx > 0) widx--; /* adjust to 0-based */
+
+    /* parse optional parameter (base index)
+     * raise an error if out of bounds
+     * or if initial position points inside a UTF8 encoding.
+     * The raw argument may be negative, so it is kept in a signed variable
+     * until abspos() has turned it into an unsigned byte offset. */
+    ssize_t sbase_default = (widx >= 0) ? 1 : (ssize_t) blen + 1;
+    ssize_t sbase = luaL_optinteger(L, 3, sbase_default);
+    size_t bbase = abspos(sbase, blen);
+    luaL_argcheck(L, bbase != (size_t) -1, 3, "position out of range");
+    if (g_utf8_get_char_validated(str + bbase, -1) == (gunichar) -1)
+        luaL_error(L, "initial position is a continuation byte");
+
+    /* convert negative index parameter to positive */
+    size_t wseglen;
+    size_t bbeg = 0;
+    if (widx < 0) {
+        /* counting backwards: measure the characters in front of the base
+         * and re-express the offset relative to the start of the string */
+        wseglen = g_utf8_strlen(str, bbase);
+        widx += (ssize_t) wseglen;
+    } else {
+        /* counting forwards: measure from the base to the end of string */
+        wseglen = g_utf8_strlen(str + bbase, blen - bbase);
+        bbeg = bbase;
+    }
+
+    /* convert positive UTF8 offset to byte offset;
+     * widx == wseglen is allowed and names the position directly after
+     * the last character of the measured segment */
+    ssize_t ret = 0;
+    if (widx >= 0 && (size_t) widx <= wseglen) {
+        gchar *pos = g_utf8_offset_to_pointer(str + bbeg, widx);
+        if (pos != NULL)
+            ret = (ssize_t) (pos - str) + 1;
+    }
+
+    /* if conversion was successful, output result (else output nil) */
+    if (ret > 0)
+        lua_pushinteger(L, ret);
+    else
+        lua_pushnil(L);
+    return 1;
+}
+
+/* Set up the "utf8" library for the given Lua state by handing its
+ * function table ("len" and "offset") to luaH_openlib(). */
+void
+utf8_lib_setup(lua_State *L)
+{
+    /* Lua name -> C implementation, closed by a NULL sentinel entry */
+    static const struct luaL_Reg utf8_functions[] = {
+        { "len",    luaH_utf8_len },
+        { "offset", luaH_utf8_offset },
+        { NULL,     NULL }
+    };
+
+    /* the same table is passed for both function-list arguments */
+    luaH_openlib(L, "utf8", utf8_functions, utf8_functions);
+}
+
+// vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
diff --git a/common/clib/utf8.h b/common/clib/utf8.h
@@ -0,0 +1,30 @@
+/*
+ * common/clib/utf8.h - UTF8 class header
+ *
+ * Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LUAKIT_COMMON_CLIB_UTF8_H
+#define LUAKIT_COMMON_CLIB_UTF8_H
+
+#include <lua.h>
+
+void utf8_lib_setup(lua_State *); /* exports utf8.len and utf8.offset to Lua */
+
+#endif
+
+// vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
diff --git a/doc/luadoc/utf8.lua b/doc/luadoc/utf8.lua
@@ -0,0 +1,40 @@
+--- Basic UTF8 character counting support for Luakit.
+--
+-- This module provides a partial implementation of the Lua 5.3 UTF-8 library.
+--
+-- @module utf8
+-- @author Dennis Hofheinz
+-- @copyright 2017 Dennis Hofheinz <github@kjdf.de>
+
+--- @function len
+-- Return the number of characters (not bytes) of a UTF-8-encoded string.
+--
+-- If the optional parameters `begin` and/or `end` are given, then characters within `s` will only be counted if they begin between positions `begin` and `end` (both inclusive).
+--
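+-- If `s` (or the characters that start in the slice from `begin` to `end`) contains invalid UTF8 characters, `nil` is returned together with the (1-based) byte position of the first invalid byte. An error is raised if `begin` is `0` or does not lie within `s` (the position directly after the last byte is still accepted), or if `end` is larger than the byte length of `s`.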
+--
+-- @tparam string s The string whose length is to be returned.
+-- @tparam[opt] integer begin Only consider `s` from (1-based byte) index `begin` onwards. If negative, count from the end of `s` (with -1 being the last byte).
+-- @default 1
+-- @tparam[opt] integer end Only consider `s` up to and including (1-based byte) index `end`. If negative, count from the end of `s` (with -1 being the last byte).
+-- @default -1
+-- @treturn integer The length (in UTF8 characters) of `s`.
+
+--- @function offset
+-- Convert an offset (in UTF8 characters) to a byte offset.
+--
+-- If optional parameter `base` is given and positive, count characters starting from (byte) index `base`.
+--
+-- An error is raised if `base` is `0` or points outside `string` (negative values count from the end of `string`, and the position directly after the last byte is still accepted), or if `base` points to a byte inside `string` that is not the starting byte of a UTF8 encoding.
+--
+-- # Examples
+--
+--  - `utf8.offset("abc",2,2)` would return `3`
+--  - `utf8.offset("abc",-3)` would return `1`
+--
+-- @tparam string string The string in which offsets should be converted.
+-- @tparam integer woffset The offset (1-based, in UTF8 characters) which should be converted.
+-- @tparam[opt] integer base A (1-based byte) index in `string`. Defaults to 1 if `woffset` is not negative, and to the (byte) length of `string` plus one if `woffset` is negative. See the description above.
+-- @treturn integer The (1-based) byte offset of the `woffset`-th UTF8 character in `string`, or `nil` if that offset lies outside `string`.
+
+-- vim: et:sw=4:ts=8:sts=4:tw=80
diff --git a/extension/extension.c b/extension/extension.c
@@ -37,6 +37,7 @@
 #include "common/clib/ipc.h"
 #include "common/clib/timer.h"
 #include "common/clib/regex.h"
+#include "common/clib/utf8.h"
 
 #include "extension/scroll.h"
 #include "extension/luajs.h"
@@ -69,6 +70,7 @@ web_lua_init(const char *package_path, const char *package_cpath)
     ipc_channel_class_setup(L);
     timer_class_setup(L);
     regex_class_setup(L);
+    utf8_lib_setup(L);
     dom_document_class_setup(L);
     dom_element_class_setup(L);
     page_class_setup(L);

diff --git a/lib/help_chrome.lua b/lib/help_chrome.lua
@@ -90,6 +90,7 @@ local builtin_module_set = {
     luakit = true,
     msg = true,
     soup = true,
+    utf8 = true,
 }
 
 local help_doc_index_page_preprocess = function (inner, style)

diff --git a/luah.c b/luah.c
@@ -41,6 +41,7 @@
 #include "common/clib/ipc.h"
 #include "common/clib/timer.h"
 #include "common/clib/regex.h"
+#include "common/clib/utf8.h"
 #include "globalconf.h"
 
 #include <glib.h>
@@ -134,6 +135,9 @@ luaH_init(gchar ** uris)
     /* Export regex */
     regex_class_setup(L);
 
+    /* Export utf8 */
+    utf8_lib_setup(L);
+
     /* Export request */
     request_class_setup(L);
 

diff --git a/tests/async/test_clib_utf8.lua b/tests/async/test_clib_utf8.lua
@@ -0,0 +1,45 @@
+--- Test utf8 clib functionality.
+--
+-- @copyright 2017 Dennis Hofheinz <github@kjdf.de>
+
+local assert = require "luassert"
+
+local T = {}
+
+-- The utf8 library must be reachable as a global table.
+function T.test_module()
+    assert.is_table(utf8)
+end
+
+-- Exercise utf8.len: plain character counts, slicing by (positive and
+-- negative) byte positions, empty slices, invalid UTF8 input and
+-- out-of-range arguments.
+T.test_utf8_len = function ()
+    assert.equal(0, utf8.len(""))
+    assert.equal(1, utf8.len("ä"))
+    assert.equal(2, utf8.len("äa"))
+    assert.equal(1, utf8.len("äa", -1))
+    assert.equal(2, utf8.len("äa", -3))
+    assert.equal(1, utf8.len("äa", 1, 1))
+    assert.equal(1, utf8.len("äa", 1, 2))
+    assert.equal(2, utf8.len("äa", 1, 3))
+    -- corner cases and errors
+    assert.equal(0, utf8.len("", 1, 0))
+    assert.equal(0, utf8.len("äa", 4))
+    assert.equal(0, utf8.len("äa", 3, 2))
+    assert.has.errors(function() utf8.len("", 1, 1) end)
+    assert.has.errors(function() utf8.len("äa", 0) end)
+    assert.has.errors(function() utf8.len("äa", 5) end)
+    -- invalid UTF8 does not raise an error: nil is returned together with
+    -- the (1-based) byte position of the offending byte
+    local len, pos = utf8.len("a\255")
+    assert.is_nil(len)
+    assert.equal(2, pos)
+    -- a slice that starts inside the encoding of "ä" is invalid as well
+    len, pos = utf8.len("äa", 2)
+    assert.is_nil(len)
+    assert.equal(2, pos)
+end
+
+-- Exercise utf8.offset: forward and backward character offsets, explicit
+-- base positions, offsets outside the string and invalid base positions.
+T.test_utf8_offset = function ()
+    assert.equal(1, utf8.offset("äaäaä", 1))
+    assert.equal(3, utf8.offset("äaäaä", 2))
+    assert.equal(7, utf8.offset("äaäaä", 5))
+    assert.equal(9, utf8.offset("äaäaä", 6))
+    assert.equal(4, utf8.offset("äaäaä", 2, 3))
+    assert.equal(7, utf8.offset("äaäaä", 2, -3))
+    -- negative offsets count backwards from the end of the string
+    assert.equal(7, utf8.offset("äaäaä", -1))
+    assert.equal(1, utf8.offset("äaäaä", -5))
+    -- offsets beyond either end of the string yield nil
+    assert.is_nil(utf8.offset("äaäaä", 7))
+    assert.is_nil(utf8.offset("äaäaä", -6))
+    -- corner cases and errors
+    assert.has.errors(function() utf8.offset("äaäaä", 1, 2) end)
+    assert.has.errors(function() utf8.offset("äaäaä", 1, 5) end)
+end
+
+return T
+
+-- vim: et:sw=4:ts=8:sts=4:tw=80
diff --git a/tests/style/test_luacheck.lua b/tests/style/test_luacheck.lua
@@ -19,6 +19,7 @@ function T.test_luacheck ()
         "ipc_channel",
         "string.wlen",
         "regex",
+        "utf8",
     }
     local ui_globals = {
         "sqlite3",