From b44af78d04a519c1bfd8311ea009a063e30e67f5 Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Mon, 23 Feb 2026 23:14:18 +0100 Subject: [PATCH 1/6] port UTF8 string manipulation modules from bstrlib Port the UTF-8 string manipulation modules from bstrlib to this fork. Credits to Paul Hsieh. utf8util is a standalone low-level module providing a forward iterator over UTF-8 byte sequences (utf8IteratorInit, utf8IteratorGetNextCodePoint, utf8IteratorGetCurrCodePoint, utf8ScanBackwardsForCodePoint) along with the cpUcs4/cpUcs2 type definitions and the isLegalUnicodeCodePoint macro. buniutil builds on top of it and bstrlib to provide four higher-level functions: buIsUTF8Content, buAppendBlkUcs4, buGetBlkUTF16, and buAppendBlkUTF16. Both modules are compiled into the main libbstring binary, enabled by default and controlled by the new enable-utf8 build option. Two adaptations were made to fit bstring's conventions: const_bstring was replaced with const bstring throughout (bstring dropped that typedef), and BSTR_PUBLIC visibility attributes were added to all public declarations. A new test module tests/testutf8.c was written from scratch, covering the full API surface including ASCII and multi-byte iteration, error recovery, surrogate pair encoding/decoding, BOM handling, and null/invalid-argument guards. 
--- README.md | 4 +- bstring/buniutil.c | 317 +++++++++++++++++++++ bstring/buniutil.h | 100 +++++++ bstring/meson.build | 12 +- bstring/utf8util.c | 300 ++++++++++++++++++++ bstring/utf8util.h | 106 +++++++ doc/introduction.md | 26 +- meson_options.txt | 6 + tests/meson.build | 12 + tests/testutf8.c | 668 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 1546 insertions(+), 5 deletions(-) create mode 100644 bstring/buniutil.c create mode 100644 bstring/buniutil.h create mode 100644 bstring/utf8util.c create mode 100644 bstring/utf8util.h create mode 100644 tests/testutf8.c diff --git a/README.md b/README.md index af6d18b..a77ba33 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Among the features achieved are: expect '\0' terminated char buffers - Improved overall performance of common string operations - Functional equivalency with other more modern languages +- Optional API for manipulating UTF-8 encoded strings ## bstring fork @@ -27,8 +28,7 @@ features (or mis-features, depending on your point of view) are included: 2. Improved test suite using the [Check][] library 3. Continuous integration via GitHub Actions, including memory profiling with [Valgrind][] 4. Remove C++ wrapper code, returning this to a pure C library -5. No UTF8 string manipulation support -6. Documentation generation with [Doxygen][] +5. Documentation generation with [Doxygen][] Currently this fork should be binary-compatible with the original code. The only source incompatibility is the removal of the `const_bstring` type. diff --git a/bstring/buniutil.c b/bstring/buniutil.c new file mode 100644 index 0000000..004a061 --- /dev/null +++ b/bstring/buniutil.c @@ -0,0 +1,317 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * buniutil.c + * + * This file is not necessarily part of the core bstring library itself, but + * is an implementation of basic UTF-8 processing for bstrings. This module + * depends on bstrlib.c and utf8util.c. + */ + +#include "bstrlib.h" +#include "buniutil.h" + +#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL) + +/* int buIsUTF8Content (const bstring bu) + * + * Scan string and return 1 if its entire contents is entirely UTF-8 code + * points. Otherwise return 0. 
+ */ +int +buIsUTF8Content(const bstring bu) +{ + struct utf8Iterator iter; + + if (NULL == bdata(bu)) return 0; + for (utf8IteratorInit(&iter, bu->data, bu->slen); + iter.next < iter.slen;) { + if (0 >= utf8IteratorGetNextCodePoint(&iter, -1)) return 0; + } + return 1; +} + +/* int buGetBlkUTF16 (cpUcs2 *ucs2, int len, cpUcs4 errCh, + * const bstring bu, int pos) + * + * Convert a string of UTF-8 code points (bu) skipping the first pos code + * points, into a sequence of UTF-16 encoded code points. Returns the + * number of UCS-2 16-bit words written to the output. No more than len + * words are written to the target array ucs2. If any code point in bu is + * unparsable, it will be translated to errCh. + */ +int +buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, + const bstring bu, int pos) +{ + struct tagbstring t; + struct utf8Iterator iter; + cpUcs4 ucs4; + int i, j; + + if (!isLegalUnicodeCodePoint(errCh)) + errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + if (NULL == ucs2 || 0 >= len || NULL == bdata(bu) || 0 > pos) + return BSTR_ERR; + + for (j=0, i=0; j < bu->slen; j++) { + if (0x80 != (0xC0 & bu->data[j])) { + if (i >= pos) break; + i++; + } + } + + t.mlen = -1; + t.data = bu->data + j; + t.slen = bu->slen - j; + + utf8IteratorInit(&iter, t.data, t.slen); + + ucs4 = BSTR_ERR; + for (i=0; 0 < len && iter.next < iter.slen && + 0 <= (ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh)); + i++) { + if (ucs4 < 0x10000) { + *ucs2++ = (cpUcs2) ucs4; + len--; + } else { + if (len < 2) { + *ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + len--; + } else { + long y = ucs4 - 0x10000; + ucs2[0] = (cpUcs2) (0xD800 | (y >> 10)); + ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF)); + len -= 2; + ucs2 += 2; + i++; + } + } + } + while (0 < len) { + *ucs2++ = 0; + len--; + } + + utf8IteratorUninit(&iter); + if (0 > ucs4) return BSTR_ERR; + return i; +} + +/* + +Unicode UTF-8 +------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 
10xxxxxx +U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +UTF-32: U-000000 - U-10FFFF + +*/ + +/* int buAppendBlkUcs4 (bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh) + * + * Convert an array of UCS-4 code points (bu) to UTF-8 code points and + * append to b. Any invalid code point is replaced by errCh. If errCh is + * itself not a valid code point, then this translation will halt upon the + * first error and return BSTR_ERR. Otherwise BSTR_OK is returned. + */ +int +buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh) +{ + int i, oldSlen; + + if (NULL == bu || NULL == b || 0 > len || + 0 > (oldSlen = blengthe(b, -1))) return BSTR_ERR; + if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; + + for (i=0; i < len; i++) { + unsigned char c[6]; + cpUcs4 v = bu[i]; + + if (!isLegalUnicodeCodePoint(v)) { + if (~0 == errCh) { + b->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } + + if (v < 0x80) { + if (BSTR_OK != bconchar(b, (char) v)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x800) { + c[0] = (unsigned char) ( (v >> 6) + 0xc0); + c[1] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 2)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x10000) { + c[0] = (unsigned char) ( (v >> 12) + 0xe0); + c[1] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[2] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 3)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else +#if 0 + if (v < 0x200000) +#endif + { + c[0] = (unsigned char) ( (v >> 18) + 0xf0); + c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[3] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 4)) { + b->slen = oldSlen; + 
return BSTR_ERR; + } + } +#if 0 + else if (v < 0x4000000) { + c[0] = (unsigned char) ( (v >> 24) + 0xf8); + c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[4] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 5)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else { + c[0] = (unsigned char) ( (v >> 30) + 0xfc); + c[1] = (unsigned char) (((v >> 24) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[4] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[5] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 6)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } +#endif + } + return BSTR_OK; +} + +#define endSwap(cs, mode) \ + ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs)) +#define TEMP_UCS4_BUFFER_SIZE (64) + +/* int buAppendBlkUTF16 (bstring bu, const cpUcs2 *utf16, int len, + * cpUcs2 *bom, cpUcs4 errCh) + * + * Append an array of UCS-2 code units (utf16) as UTF-8 to bstring bu. + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, then this translation will halt upon the first error + * and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order + * mark has been previously read, it may be passed in as bom, otherwise if + * *bom is set to 0, it will be filled in with the BOM as read from the + * first character if it is a BOM. + */ +int +buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, + cpUcs4 errCh) +{ + cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE]; + int cc, i, sm, oldSlen; + + if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR; + if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; + if (len == 0) return BSTR_OK; + + oldSlen = bu->slen; + i = 0; + + /* Check for BOM character and select endianness. 
Also remove the + BOM from the stream, since there is no need for it in UTF-8. */ + if (bom && (cpUcs2) 0xFFFE == *bom) { + sm = 8; + } else if (bom && (cpUcs2) 0xFEFF == *bom) { + sm = 0; + } else if (utf16[i] == (cpUcs2) 0xFFFE) { + if (bom) *bom = utf16[i]; + sm = 8; + i++; + } else if (utf16[i] == (cpUcs2) 0xFEFF) { + if (bom) *bom = utf16[i]; + sm = 0; + i++; + } else { + sm = 0; /* Assume local endianness. */ + } + + cc = 0; + for (; i < len; i++) { + cpUcs4 c, v; + v = endSwap(utf16[i], sm); + + if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ + if (v >= 0xDC00 || i + 1 >= len) { /* stray low, or high at end */ + ErrMode:; + if (~0 == errCh) { + ErrReturn:; + bu->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } else { + i++; + c = endSwap(utf16[i], sm) - 0xDC00; /* negative if below 0xDC00 */ + if (c < 0 || c > 0x3FF) goto ErrMode; + v = ((v - 0xD800) << 10) + c + 0x10000; + } + } + buff[cc] = v; + cc++; + if (cc >= TEMP_UCS4_BUFFER_SIZE) { + if (0 > buAppendBlkUcs4(bu, buff, cc, errCh)) + goto ErrReturn; + cc = 0; + } + } + if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) goto ErrReturn; + + return BSTR_OK; +} diff --git a/bstring/buniutil.h b/bstring/buniutil.h new file mode 100644 index 0000000..7114991 --- /dev/null +++ b/bstring/buniutil.h @@ -0,0 +1,100 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/** + * \file + * \brief Interface for basic Unicode utility functions for bstrings. + * + * Depends on bstrlib.h and utf8util.h. + */ + +#ifndef BSTRLIB_UNICODE_UTILITIES +#define BSTRLIB_UNICODE_UTILITIES + +#include "bstrlib.h" +#include "utf8util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Scan a bstring and return 1 if its entire content consists of valid UTF-8 + * encoded code points, otherwise return 0. + */ +BSTR_PUBLIC int +buIsUTF8Content(const bstring bu); + +/** + * Convert an array of UCS-4 code points (bu, len elements) to UTF-8 and + * append the result to the bstring b. + * + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, translation halts on the first error and BSTR_ERR is + * returned. Otherwise BSTR_OK is returned. 
+ */ +BSTR_PUBLIC int +buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh); + +/* For those unfortunate enough to be stuck supporting UTF-16. */ + +/** + * Convert the UTF-8 bstring bu (starting at code-point offset pos) to a + * sequence of UTF-16 encoded code units written to ucs2 (at most len units). + * + * Returns the number of UCS-2 16-bit words written. Any unparsable code + * point is translated to errCh. + */ +BSTR_PUBLIC int +buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, + const bstring bu, int pos); + +/** + * Append an array of UTF-16 code units (utf16, len elements) to the UTF-8 + * bstring bu. + * + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, translation halts on the first error and BSTR_ERR is + * returned. Otherwise BSTR_OK is returned. If a byte order mark has been + * previously read it may be passed in via bom; if *bom is 0 it will be + * filled in from the first character if it is a BOM. 
+ */ +BSTR_PUBLIC int +buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, + cpUcs4 errCh); + +#ifdef __cplusplus +} +#endif + +#endif /* BSTRLIB_UNICODE_UTILITIES */ diff --git a/bstring/meson.build b/bstring/meson.build index 01a8043..bb46e8c 100644 --- a/bstring/meson.build +++ b/bstring/meson.build @@ -1,8 +1,16 @@ -install_headers(['bstraux.h', 'bstrlib.h']) +bstring_sources = ['bstraux.c', 'bstrlib.c'] +bstring_headers = ['bstraux.h', 'bstrlib.h'] + +if get_option('enable-utf8') + bstring_sources += ['buniutil.c', 'utf8util.c'] + bstring_headers += ['buniutil.h', 'utf8util.h'] +endif + +install_headers(bstring_headers) libbstring = library( meson.project_name(), - ['bstraux.c', 'bstrlib.c'], + bstring_sources, version: meson.project_version(), soversion: '1', include_directories: bstring_inc, diff --git a/bstring/utf8util.c b/bstring/utf8util.c new file mode 100644 index 0000000..48cf540 --- /dev/null +++ b/bstring/utf8util.c @@ -0,0 +1,300 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * utf8util.c + * + * This file is not necessarily part of the core bstring library itself, but + * is a generic module for implementing UTF-8 utility functions. + */ + +#include "utf8util.h" + +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +/* Surrogate range is wrong, there is a maximum, the BOM alias is illegal + and 0xFFFF is illegal */ +#define isLegalUnicodeCodePoint(v) \ + ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \ + (((unsigned long)(v)) <= 0x0010FFFFL) && \ + (((v)|0x1F0001) != 0x1FFFFFL)) + +void +utf8IteratorInit(struct utf8Iterator *iter, unsigned char *data, int slen) +{ + if (iter) { + iter->data = data; + iter->slen = (iter->data && slen >= 0) ? slen : -1; + iter->start = -1; + iter->next = (iter->slen >= 0) ? 0 : -1; + iter->error = (iter->slen >= 0) ? 
0 : 1; + } +} + +void +utf8IteratorUninit(struct utf8Iterator *iter) +{ + if (iter) { + iter->data = NULL; + iter->slen = -1; + iter->start = iter->next = -1; + } +} + +int +utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, int pos, + cpUcs4 *out) +{ + cpUcs4 v1, v2, v3, v4, x; + int ret; + if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) { + return -__LINE__; + } + if (!out) out = &x; + ret = 0; + if (msg[pos] < 0x80) { + *out = msg[pos]; + return 0; + } else if (msg[pos] < 0xC0) { + if (0 == pos) return -__LINE__; + ret = -__LINE__; + if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { + pos--; + ret = 1; + } else { + if (1 == pos) return -__LINE__; + if ((msg[pos-1] | 0x3F) != 0xBF) return -__LINE__; + if (msg[pos-2] >= 0xE0 && msg[pos-2] < 0xF8) { + pos -= 2; + ret = 2; + } else { + if (2 == pos) return -__LINE__; + if ((msg[pos-2] | 0x3F) != 0xBF) return -__LINE__; + if ((msg[pos-3]|0x07) == 0xF7) { + pos -= 3; + ret = 3; + } else return -__LINE__; + } + } + } + if (msg[pos] < 0xE0) { + if (pos + 1 >= len) return -__LINE__; + v1 = msg[pos] & ~0xE0; + v2 = msg[pos+1] & ~0xC0; + v1 = (v1 << 6) + v2; + if (v1 < 0x80) return -__LINE__; + *out = v1; + return ret; + } + if (msg[pos] < 0xF0) { + if (pos + 2 >= len) return -__LINE__; + v1 = msg[pos] & ~0xF0; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v1 = (v1 << 12) + (v2 << 6) + v3; + if (v1 < 0x800) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; + } + + if (msg[pos] >= 0xF8) return -__LINE__; + + if (pos + 3 >= len) return -__LINE__; + v1 = msg[pos] & ~0xF8; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v4 = msg[pos+3] & ~0xC0; + v1 = (v1 << 18) + (v2 << 12) + (v3 << 6) + v4; + if (v1 < 0x10000) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; +} + +/* +Code point UTF-8 +---------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx 
+U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +*/ + +/* + * Returns next read code point for iterator. + * + * iter->data + iter->start points at the characters just read. + * + * iter->data + iter->next points at the characters that will be read next. + * + * iter->error is boolean indicating whether or not last read contained + * an error. + */ +cpUcs4 +utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) +{ + unsigned char *chrs; + unsigned char c, d, e; + long v; + int i, ofs; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || + utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + ofs = 0; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) goto ErrMode; + v = c; + ofs = 1; + } else if (c < 0xE0) { + if (iter->slen - iter->next < 2) goto ErrMode; /* truncated */ + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) goto ErrMode; + ofs = 2; + } else if (c < 0xF0) { + if (iter->slen - iter->next < 3) goto ErrMode; /* truncated */ + v = (c << 12) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) goto ErrMode; + ofs = 3; + } else if (c < 0xF8) { + if (iter->slen - iter->next < 4) goto ErrMode; /* truncated */ + v = (c << 18) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12u) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 
0x10000 || + !isLegalUnicodeCodePoint(v)) goto ErrMode; + ofs = 4; + } else { /* 5 and 6 byte encodings are invalid */ + ErrMode:; + iter->error = 1; + v = errCh; + for (i = iter->next+1; i < iter->slen; i++) { + if ((iter->data[i] & 0xC0) != 0x80) break; + } + ofs = i - iter->next; + } + + iter->start = iter->next; + iter->next += ofs; + return v; +} + +/* + * Returns current code point for iterator without advancing. + * + * iter->data + iter->start points at the characters to be read. + * + * iter->data + iter->next points at the characters that will be read next. + * + * iter->error is boolean indicating whether or not last read contained + * an error. + */ +cpUcs4 +utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) +{ + unsigned char *chrs; + unsigned char c, d, e; + long v; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || + utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) goto ErrMode; + v = c; + } else if (c < 0xE0) { + if (iter->slen - iter->next < 2) goto ErrMode; /* truncated */ + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) goto ErrMode; + } else if (c < 0xF0) { + if (iter->slen - iter->next < 3) goto ErrMode; /* truncated */ + v = (c << 12lu) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) goto ErrMode; + } else if (c < 0xF8) { + if (iter->slen - iter->next < 4) goto ErrMode; /* truncated */ + v = (c << 18lu) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12lu) + (d 
<< 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || + !isLegalUnicodeCodePoint(v)) goto ErrMode; + } else { /* 5 and 6 byte encodings are invalid */ + ErrMode:; + iter->error = 1; + v = errCh; + } + return v; +} diff --git a/bstring/utf8util.h b/bstring/utf8util.h new file mode 100644 index 0000000..2d2a7ca --- /dev/null +++ b/bstring/utf8util.h @@ -0,0 +1,106 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/** + * \file + * \brief Interface for low-level UTF-8 utility functions. + * + * This module is standalone and does not depend on bstrlib. + */ + +#ifndef UTF8_UNICODE_UTILITIES +#define UTF8_UNICODE_UTILITIES + +#include <limits.h> + +/* If bstrlib.h has not been included, define the visibility attribute here. + The #ifndef guard ensures we don't conflict if bstrlib.h came first. */ +#ifndef BSTR_PUBLIC +# if __GNUC__ >= 4 +# define BSTR_PUBLIC __attribute__ ((visibility ("default"))) +# else +# define BSTR_PUBLIC +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if INT_MAX >= 0x7fffffffUL +typedef int cpUcs4; +#elif LONG_MAX >= 0x7fffffffUL +typedef long cpUcs4; +#else +#error This compiler is not supported +#endif + +#if UINT_MAX == 0xFFFF +typedef unsigned int cpUcs2; +#elif USHRT_MAX == 0xFFFF +typedef unsigned short cpUcs2; +#elif UCHAR_MAX == 0xFFFF +typedef unsigned char cpUcs2; +#else +#error This compiler is not supported +#endif + +#define isLegalUnicodeCodePoint(v) \ + ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \ + (((unsigned long)(v)) <= 0x0010FFFFL) && \ + (((v)|0x1F0001) != 0x1FFFFFL)) + +struct utf8Iterator { + unsigned char *data; + int slen; + int start, next; + int error; +}; + +#define utf8IteratorNoMore(it) (!(it) || (it)->next >= (it)->slen) + +BSTR_PUBLIC void utf8IteratorInit(struct utf8Iterator *iter, + unsigned char *data, int slen); +BSTR_PUBLIC void utf8IteratorUninit(struct utf8Iterator *iter); +BSTR_PUBLIC cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, + cpUcs4 errCh); +BSTR_PUBLIC cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, + cpUcs4 errCh); +BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, + int pos, cpUcs4 *out); + +#ifdef __cplusplus +} +#endif + +#endif /* UTF8_UNICODE_UTILITIES */ diff --git a/doc/introduction.md 
b/doc/introduction.md index c140d48..11f8ec7 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -327,7 +327,7 @@ object in a multithreaded environment. Problems Not Solved ------------------- -Bstrlib is written for the C languages, which have inherent weaknesses that +Bstrlib is written for the C language, which has inherent weaknesses that cannot be easily solved: 1. Memory leaks: Forgetting to call `bdestroy` on a bstring that is about to @@ -349,6 +349,29 @@ Other problems not addressed: > Note: except for spotty support of wide characters, the default C standard library does not address any of these problems either. +Unicode functions +----------------- + +The two modules utf8util.c and buniutil.c implement basic functions for +parsing and collecting Unicode data in the UTF-8 format. Unicode is +described by a sequence of "code points" which are values between 0 and +1114111 inclusive mapped to symbol content corresponding to nearly all +the standardized scripts of the world. + +The semantics of Unicode code points is varied and complicated. The +base support of the better string library does not attempt to perform +any interpretation of these code points. The better string library +solely provides support for iterating through Unicode code points, +appending and extracting code points to and from bstrings, and parsing +UTF-8 and UTF-16 from raw data. + +The types cpUcs4 and cpUcs2 are defined as 4 byte and 2 byte +encoding formats corresponding to UCS4 and UCS2 respectively. To test +if a raw code point is valid, the macro isLegalUnicodeCodePoint() has +been defined. The utf8 iterator is defined by struct utf8Iterator. To +test whether the iterator has run out of code points, the macro +utf8IteratorNoMore() has been defined. + The `bstest` Module ------------------- @@ -871,3 +894,4 @@ and testing of the Better String Library: * Richard A. Smith * Simon Ekstrom * Wayne Scott +* Zed A. 
Shaw diff --git a/meson_options.txt b/meson_options.txt index 6b0b4fd..5bf78df 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -16,3 +16,9 @@ option( value: false, description: 'Build unit tests', ) +option( + 'enable-utf8', + type: 'boolean', + value: true, + description: 'Build bstring library with UTF-8 support', +) diff --git a/tests/meson.build b/tests/meson.build index 66b94bf..28ac1c0 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -15,3 +15,15 @@ test_executable_aux = executable( test('bstring unit tests', test_executable) test('bstring auxiliary unit tests', test_executable_aux) + +if get_option('enable-utf8') + test_executable_utf8 = executable( + 'testutf8', + 'testutf8.c', + link_with: libbstring, + include_directories: bstring_inc, + dependencies: check, + ) + + test('bstring UTF-8 unit tests', test_executable_utf8) +endif diff --git a/tests/testutf8.c b/tests/testutf8.c new file mode 100644 index 0000000..1e28d17 --- /dev/null +++ b/tests/testutf8.c @@ -0,0 +1,668 @@ +/* Copyright (C) 2026 Daniel Markstedt + * UTF-8 unit tests for the Better String Library + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * This file is the C unit test for the UTF-8 modules (utf8util, buniutil). 
+ * + * Test data quick reference: + * U+0041 'A' = 0x41 (1-byte ASCII) + * U+00A9 '©' = 0xC2 0xA9 (2-byte) + * U+20AC '€' = 0xE2 0x82 0xAC (3-byte) + * U+1F600 '😀' = 0xF0 0x9F 0x98 0x80 (4-byte) + * UTF-16 U+1F600 = { 0xD83D, 0xDE00 } (surrogate pair) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "buniutil.h" +#include "bstrlib.h" +#include "utf8util.h" +#include <check.h> +#include <stdlib.h> +#include <string.h> + +/* ----------------------------------------------------------------------- + * core_000: utf8IteratorInit — valid inputs and error inputs + * ----------------------------------------------------------------------- */ +START_TEST(core_000) +{ + struct utf8Iterator iter; + unsigned char data[] = "Hello"; + + /* NULL iter pointer must not crash */ + utf8IteratorInit(NULL, data, 5); + + /* Valid initialisation */ + utf8IteratorInit(&iter, data, 5); + ck_assert_int_eq(iter.slen, 5); + ck_assert_int_eq(iter.next, 0); + ck_assert_int_eq(iter.start, -1); + ck_assert_int_eq(iter.error, 0); + ck_assert(iter.data == data); + + /* NULL data → sentinel values */ + utf8IteratorInit(&iter, NULL, 5); + ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.next, -1); + ck_assert_int_eq(iter.error, 1); + + /* Negative slen → sentinel values */ + utf8IteratorInit(&iter, data, -1); + ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.next, -1); + ck_assert_int_eq(iter.error, 1); + + /* Zero-length string is valid */ + utf8IteratorInit(&iter, data, 0); + ck_assert_int_eq(iter.slen, 0); + ck_assert_int_eq(iter.next, 0); + ck_assert_int_eq(iter.error, 0); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_001: utf8IteratorUninit — clears all fields; handles NULL gracefully + * ----------------------------------------------------------------------- */ +START_TEST(core_001) +{ + struct utf8Iterator iter; + unsigned char data[] = "Hello"; + + utf8IteratorInit(&iter, data, 5); + utf8IteratorUninit(&iter); + ck_assert(iter.data == NULL); +
ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.start, -1); + ck_assert_int_eq(iter.next, -1); + + /* NULL pointer must not crash */ + utf8IteratorUninit(NULL); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_002: utf8IteratorGetNextCodePoint — ASCII string iteration + * ----------------------------------------------------------------------- */ +START_TEST(core_002) +{ + struct utf8Iterator iter; + unsigned char data[] = "ABC"; + cpUcs4 cp; + + utf8IteratorInit(&iter, data, 3); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'A'); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 1); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'B'); + ck_assert_int_eq(iter.start, 1); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'C'); + ck_assert_int_eq(iter.start, 2); + ck_assert_int_eq(iter.next, 3); + + /* Past end → errCh */ + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + + /* NULL iterator → errCh */ + cp = utf8IteratorGetNextCodePoint(NULL, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_003: utf8IteratorGetNextCodePoint — multi-byte sequences + * + * Sequence: © (U+00A9, 2-byte) € (U+20AC, 3-byte) 😀 (U+1F600, 4-byte) + * Bytes: C2 A9 E2 82 AC F0 9F 98 80 + * ----------------------------------------------------------------------- */ +START_TEST(core_003) +{ + struct utf8Iterator iter; + /* © € 😀 */ + unsigned char data[] = { + 0xC2, 0xA9, /* U+00A9 © */ + 0xE2, 0x82, 0xAC, /* U+20AC € */ + 0xF0, 0x9F, 0x98, 0x80 /* U+1F600 😀 */ + }; + cpUcs4 cp; + + utf8IteratorInit(&iter, data, sizeof(data)); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.error, 0); + 
ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x20AC); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 2); + ck_assert_int_eq(iter.next, 5); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x1F600); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 5); + ck_assert_int_eq(iter.next, 9); + + /* Exhausted */ + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_004: utf8IteratorGetNextCodePoint — invalid byte sequences + * + * 0x80 alone is a stray continuation byte (invalid lead). + * 0xFF is never valid in UTF-8. + * ----------------------------------------------------------------------- */ +START_TEST(core_004) +{ + struct utf8Iterator iter; + /* stray continuation, then a valid ASCII char */ + unsigned char data_cont[] = { 0x80, 0x41 }; + /* 0xFF is always invalid */ + unsigned char data_ff[] = { 0xFF, 0x41 }; + cpUcs4 cp; + + /* Stray continuation byte → error, iterator skips to next valid lead */ + utf8IteratorInit(&iter, data_cont, sizeof(data_cont)); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + /* After error the iterator should have advanced past the bad byte(s) */ + ck_assert(iter.next > 0); + + /* 0xFF lead byte → error */ + utf8IteratorInit(&iter, data_ff, sizeof(data_ff)); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_005: utf8IteratorGetCurrCodePoint — peek without advancing + * ----------------------------------------------------------------------- */ +START_TEST(core_005) +{ + struct utf8Iterator iter; + unsigned char data[] = { 0xC2, 0xA9, 0x41 
}; /* © A */ + cpUcs4 cp; + + utf8IteratorInit(&iter, data, sizeof(data)); + + /* Peek twice at the same position — must not advance */ + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.next, 0); /* still at start */ + + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.next, 0); + + /* Now advance with GetNext, then peek the next char */ + utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x41); /* 'A' */ + ck_assert_int_eq(iter.next, 2); /* still not advanced */ + + /* NULL iterator → errCh */ + cp = utf8IteratorGetCurrCodePoint(NULL, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_006: utf8ScanBackwardsForCodePoint — various positions + * + * Data: © (0xC2 0xA9) at bytes 0-1, then 'A' (0x41) at byte 2 + * ----------------------------------------------------------------------- */ +START_TEST(core_006) +{ + unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */ + cpUcs4 out; + int ret; + + /* pos=0 is the lead byte of © — ret=0, out=0xA9 */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out); + ck_assert_int_eq(ret, 0); + ck_assert_int_eq(out, 0x00A9); + + /* pos=1 is the continuation byte — ret=1 (1 byte back to lead), out=0xA9 */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 1, &out); + ck_assert_int_eq(ret, 1); + ck_assert_int_eq(out, 0x00A9); + + /* pos=2 is ASCII 'A' — ret=0, out='A' */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 2, &out); + ck_assert_int_eq(ret, 0); + ck_assert_int_eq(out, 0x41); + + /* NULL msg → error (negative) */ + ret = utf8ScanBackwardsForCodePoint(NULL, 3, 0, &out); + ck_assert(ret < 0); + + /* pos out of range → error */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 3, &out); + ck_assert(ret < 0); + + /* out=NULL is accepted; return 
value indicates success/failure */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 2, NULL); + ck_assert_int_eq(ret, 0); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_007: buIsUTF8Content + * ----------------------------------------------------------------------- */ +START_TEST(core_007) +{ + bstring b; + int ret; + + /* NULL bstring → 0 */ + ret = buIsUTF8Content(NULL); + ck_assert_int_eq(ret, 0); + + /* Empty string → 1 (vacuously true) */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + + /* Pure ASCII → 1 */ + b = bfromcstr("Hello, world!"); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + + /* Valid multi-byte UTF-8: © € 😀 */ + { + unsigned char utf8[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC, + 0xF0, 0x9F, 0x98, 0x80 + }; + b = blk2bstr(utf8, sizeof(utf8)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + } + + /* Invalid: stray 0x80 continuation byte → 0 */ + { + unsigned char bad[] = { 0x41, 0x80, 0x41 }; + b = blk2bstr(bad, sizeof(bad)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid: truncated 2-byte sequence → 0 */ + { + unsigned char bad[] = { 0xC2 }; /* lead without continuation */ + b = blk2bstr(bad, sizeof(bad)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_008: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring + * ----------------------------------------------------------------------- */ +START_TEST(core_008) +{ + bstring b; + int ret; + + /* NULL arguments → BSTR_ERR */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(NULL, NULL, 0, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUcs4(b, NULL, 
1, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + bdestroy(b); + + /* ASCII code points */ + { + cpUcs4 pts[] = { 'H', 'i' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 2, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ck_assert_int_eq(b->data[0], 'H'); + ck_assert_int_eq(b->data[1], 'i'); + bdestroy(b); + } + + /* Mixed: © (U+00A9) and € (U+20AC) */ + { + cpUcs4 pts[] = { 0x00A9, 0x20AC }; + unsigned char expected[] = { + 0xC2, 0xA9, /* © */ + 0xE2, 0x82, 0xAC /* € */ + }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 2, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 5); + ret = memcmp(b->data, expected, 5); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* 4-byte: 😀 (U+1F600) */ + { + cpUcs4 pts[] = { 0x1F600 }; + unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 4); + ret = memcmp(b->data, expected, 4); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid code point with valid errCh → substituted */ + { + cpUcs4 pts[] = { 0xD800 }; /* surrogates are illegal */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], '?'); + bdestroy(b); + } + + /* Invalid code point with invalid errCh → BSTR_ERR, bstring unchanged */ + { + cpUcs4 pts[] = { 0xD800 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, 0xD800); /* errCh also invalid */ + ck_assert_int_eq(ret, BSTR_ERR); + /* slen must be rolled back */ + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } + + /* Zero-length array → BSTR_OK, nothing appended */ + { + cpUcs4 pts[] = { 'X' }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 0, '?'); + 
ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_009: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array + * ----------------------------------------------------------------------- */ +START_TEST(core_009) +{ + cpUcs2 buf[16]; + int ret; + + /* NULL arguments → BSTR_ERR */ + { + unsigned char raw[] = { 0x41 }; + bstring b = blk2bstr(raw, 1); + ck_assert(b != NULL); + ret = buGetBlkUTF16(NULL, 4, '?', b, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 0, '?', b, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 4, '?', NULL, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 4, '?', b, -1); + ck_assert_int_eq(ret, BSTR_ERR); + bdestroy(b); + } + + /* ASCII "AB" → UTF-16 { 0x0041, 0x0042, 0, ... } */ + { + bstring b = bfromcstr("AB"); + ck_assert(b != NULL); + memset(buf, 0xFF, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); + ck_assert_int_eq(buf[0], 0x0041); + ck_assert_int_eq(buf[1], 0x0042); + ck_assert_int_eq(buf[2], 0); /* null-padded */ + bdestroy(b); + } + + /* © € → UTF-16 BMP values (U+00A9, U+20AC) */ + { + unsigned char raw[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC + }; + bstring b = blk2bstr(raw, sizeof(raw)); + ck_assert(b != NULL); + memset(buf, 0xFF, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); + ck_assert_int_eq(buf[0], 0x00A9); + ck_assert_int_eq(buf[1], 0x20AC); + bdestroy(b); + } + + /* pos=1 skips first code point */ + { + bstring b = bfromcstr("AB"); + ck_assert(b != NULL); + memset(buf, 0, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 1); + ck_assert_int_eq(ret, 1); + ck_assert_int_eq(buf[0], 0x0042); + bdestroy(b); + } + + /* Supplementary character 😀 (U+1F600) → surrogate pair */ + { + unsigned char raw[] = { 0xF0, 0x9F, 0x98, 0x80 }; + bstring b = blk2bstr(raw, sizeof(raw)); + ck_assert(b != 
NULL); + memset(buf, 0, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); /* one code point → two UTF-16 units */ + ck_assert_int_eq(buf[0], 0xD83D); /* high surrogate */ + ck_assert_int_eq(buf[1], 0xDE00); /* low surrogate */ + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_010: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring + * ----------------------------------------------------------------------- */ +START_TEST(core_010) +{ + bstring b; + int ret; + + /* NULL / bad arguments → BSTR_ERR */ + b = bfromcstr(""); + ck_assert(b != NULL); + { + cpUcs2 u[] = { 0x0041 }; + ret = buAppendBlkUTF16(NULL, u, 1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUTF16(b, NULL, 1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUTF16(b, u, -1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + } + bdestroy(b); + + /* Zero-length input → BSTR_OK, nothing appended */ + { + cpUcs2 u[] = { 0x0041 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 0, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } + + /* ASCII "AB" in UTF-16 → "AB" in UTF-8 */ + { + cpUcs2 u[] = { 0x0041, 0x0042 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ck_assert_int_eq(b->data[0], 'A'); + ck_assert_int_eq(b->data[1], 'B'); + bdestroy(b); + } + + /* BMP characters: U+00A9 © and U+20AC € */ + { + cpUcs2 u[] = { 0x00A9, 0x20AC }; + unsigned char expected[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC + }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 5); + ret = memcmp(b->data, expected, 5); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Surrogate pair: 😀 (U+1F600) = { 0xD83D, 
0xDE00 } */ + { + cpUcs2 u[] = { 0xD83D, 0xDE00 }; + unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 4); + ret = memcmp(b->data, expected, 4); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Little-endian BOM (0xFFFE) → byte-swapped input */ + { + /* 'A' (0x0041) with bytes swapped = 0x4100, plus LE BOM */ + cpUcs2 u[] = { 0xFFFE, 0x4100 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Big-endian BOM (0xFEFF) is consumed and removed from output */ + { + cpUcs2 u[] = { 0xFEFF, 0x0041 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } +} +END_TEST + +int +main(void) +{ + /* Build test suite */ + Suite *suite = suite_create("bstr-utf8"); + /* Core tests */ + TCase *core = tcase_create("Core"); + tcase_add_test(core, core_000); + tcase_add_test(core, core_001); + tcase_add_test(core, core_002); + tcase_add_test(core, core_003); + tcase_add_test(core, core_004); + tcase_add_test(core, core_005); + tcase_add_test(core, core_006); + tcase_add_test(core, core_007); + tcase_add_test(core, core_008); + tcase_add_test(core, core_009); + tcase_add_test(core, core_010); + suite_add_tcase(suite, core); + /* Run tests */ + SRunner *runner = srunner_create(suite); + srunner_run_all(runner, CK_ENV); + int number_failed = srunner_ntests_failed(runner); + srunner_free(runner); + return (0 == number_failed) ? 
EXIT_SUCCESS : EXIT_FAILURE; +} From 2fcdc0fab4aa113e4b4e5a91936e46ad8e7b997f Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Tue, 24 Feb 2026 07:37:02 +0100 Subject: [PATCH 2/6] address static analysis bugs flagged by SonarQube --- bstring/buniutil.c | 27 +++++++++++++++++---------- bstring/utf8util.c | 23 ++++++++++++++++------- bstring/utf8util.h | 7 ++++--- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/bstring/buniutil.c b/bstring/buniutil.c index 004a061..ceafae7 100644 --- a/bstring/buniutil.c +++ b/bstring/buniutil.c @@ -78,7 +78,8 @@ buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, struct tagbstring t; struct utf8Iterator iter; cpUcs4 ucs4; - int i, j; + int i; + int j; if (!isLegalUnicodeCodePoint(errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; @@ -99,9 +100,10 @@ buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, utf8IteratorInit(&iter, t.data, t.slen); ucs4 = BSTR_ERR; - for (i=0; 0 < len && iter.next < iter.slen && - 0 <= (ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh)); - i++) { + for (i=0; 0 < len && iter.next < iter.slen; i++) { + ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh); + if (0 > ucs4) break; + if (ucs4 < 0x10000) { *ucs2++ = (cpUcs2) ucs4; len--; @@ -155,13 +157,14 @@ UTF-32: U-000000 - U-10FFFF int buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh) { - int i, oldSlen; + int oldSlen; - if (NULL == bu || NULL == b || 0 > len || - 0 > (oldSlen = blengthe(b, -1))) return BSTR_ERR; + if (NULL == bu || NULL == b || 0 > len) return BSTR_ERR; + oldSlen = blengthe(b, -1); + if (0 > oldSlen) return BSTR_ERR; if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; - for (i=0; i < len; i++) { + for (int i=0; i < len; i++) { unsigned char c[6]; cpUcs4 v = bu[i]; @@ -255,7 +258,10 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, cpUcs4 errCh) { cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE]; - int cc, i, sm, oldSlen; + int cc; + int i; + int sm; + int oldSlen; if 
(NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR; if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; @@ -284,7 +290,8 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, cc = 0; for (; i < len; i++) { - cpUcs4 c, v; + cpUcs4 c; + cpUcs4 v; v = endSwap(utf16[i], sm); if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ diff --git a/bstring/utf8util.c b/bstring/utf8util.c index 48cf540..c6e2a80 100644 --- a/bstring/utf8util.c +++ b/bstring/utf8util.c @@ -78,10 +78,14 @@ utf8IteratorUninit(struct utf8Iterator *iter) } int -utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, int pos, +utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, cpUcs4 *out) { - cpUcs4 v1, v2, v3, v4, x; + cpUcs4 v1; + cpUcs4 v2; + cpUcs4 v3; + cpUcs4 v4; + cpUcs4 x; int ret; if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) { return -__LINE__; @@ -173,10 +177,13 @@ U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) { - unsigned char *chrs; - unsigned char c, d, e; + const unsigned char *chrs; + unsigned char c; + unsigned char d; + unsigned char e; long v; - int i, ofs; + int i; + int ofs; if (NULL == iter || iter->next < 0) return errCh; if (iter->next >= iter->slen) { @@ -249,8 +256,10 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) { - unsigned char *chrs; - unsigned char c, d, e; + const unsigned char *chrs; + unsigned char c; + unsigned char d; + unsigned char e; long v; if (NULL == iter || iter->next < 0) return errCh; diff --git a/bstring/utf8util.h b/bstring/utf8util.h index 2d2a7ca..84aeda0 100644 --- a/bstring/utf8util.h +++ b/bstring/utf8util.h @@ -83,7 +83,8 @@ typedef unsigned char cpUcs2; struct utf8Iterator { unsigned char *data; int slen; - int start, next; + int start; + int next; int 
error; }; @@ -96,8 +97,8 @@ BSTR_PUBLIC cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh); BSTR_PUBLIC cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh); -BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, - int pos, cpUcs4 *out); +BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(const unsigned char *msg, + int len, int pos, cpUcs4 *out); #ifdef __cplusplus } From 1b35a07d0f7a52653c784afa97c05583121c536a Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Tue, 24 Feb 2026 07:56:32 +0100 Subject: [PATCH 3/6] refactor surrogate substitution with flatter control flow --- bstring/buniutil.c | 49 ++++++++++++------- tests/testutf8.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 17 deletions(-) diff --git a/bstring/buniutil.c b/bstring/buniutil.c index ceafae7..fa6d37e 100644 --- a/bstring/buniutil.c +++ b/bstring/buniutil.c @@ -289,36 +289,51 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, } cc = 0; - for (; i < len; i++) { - cpUcs4 c; + while (i < len) { cpUcs4 v; + int invalid = 0; + v = endSwap(utf16[i], sm); + i++; if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ - if (v >= 0xDC00 || i >= len) { - ErrMode:; - if (~0 == errCh) { - ErrReturn:; - bu->slen = oldSlen; - return BSTR_ERR; - } - v = errCh; + if (v >= 0xDC00) { + invalid = 1; /* Isolated low surrogate */ + } else if (i >= len) { + invalid = 1; /* Unterminated high surrogate */ } else { - i++; - if ((c = endSwap(utf16[i], sm) - 0xDC00) > 0x3FF) - goto ErrMode; - v = ((v - 0xD800) << 10) + c + 0x10000; + cpUcs4 c = endSwap(utf16[i], sm); + if (c < 0xDC00 || c > 0xDFFF) { + invalid = 1; + } else { + i++; + v = ((v - 0xD800) << 10) + (c - 0xDC00) + 0x10000; + } + } + } + + if (invalid) { + if (~0 == errCh) { + bu->slen = oldSlen; + return BSTR_ERR; } + v = errCh; } + buff[cc] = v; cc++; if (cc >= TEMP_UCS4_BUFFER_SIZE) { - if (0 > buAppendBlkUcs4(bu, buff, 
cc, errCh)) - goto ErrReturn; + if (0 > buAppendBlkUcs4(bu, buff, cc, errCh)) { + bu->slen = oldSlen; + return BSTR_ERR; + } cc = 0; } } - if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) goto ErrReturn; + if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) { + bu->slen = oldSlen; + return BSTR_ERR; + } return BSTR_OK; } diff --git a/tests/testutf8.c b/tests/testutf8.c index 1e28d17..253ba84 100644 --- a/tests/testutf8.c +++ b/tests/testutf8.c @@ -637,6 +637,120 @@ START_TEST(core_010) ck_assert_int_eq(b->data[0], 'A'); bdestroy(b); } + + /* Invalid low surrogate alone with valid errCh → substituted */ + { + cpUcs2 u[] = { 0xDC00 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], '?'); + bdestroy(b); + } + + /* Invalid low surrogate then ASCII with valid errCh */ + { + cpUcs2 u[] = { 0xDC00, 0x0041 }; + unsigned char expected[] = { '?', 'A' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ret = memcmp(b->data, expected, 2); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid surrogate with invalid errCh → BSTR_ERR and rollback */ + { + cpUcs2 u[] = { 0xDC00 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, NULL, 0xD800); /* invalid errCh */ + ck_assert_int_eq(ret, BSTR_ERR); + ck_assert_int_eq(b->slen, 3); /* unchanged */ + ck_assert_int_eq(b->data[0], 'p'); + ck_assert_int_eq(b->data[1], 'r'); + ck_assert_int_eq(b->data[2], 'e'); + bdestroy(b); + } + + /* bom out-parameter gets set when BOM appears in stream */ + { + cpUcs2 in_bom = 0; + cpUcs2 u[] = { 0xFEFF, 0x0041 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, &in_bom, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(in_bom, 0xFEFF); + ck_assert_int_eq(b->slen, 
1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Pre-seeded bom controls endianness even without BOM in input */ + { + cpUcs2 in_bom = 0xFFFE; + cpUcs2 u[] = { 0x4100 }; /* bytes for 0x0041 in opposite endian */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, &in_bom, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(in_bom, 0xFFFE); /* preserved */ + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Larger than TEMP_UCS4_BUFFER_SIZE exercises internal flush path */ + { + cpUcs2 u[80]; + for (int j = 0; j < 80; j++) { + u[j] = (cpUcs2)('A' + (j % 26)); + } + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 80, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 80); + for (int j = 0; j < 80; j++) { + ck_assert_int_eq(b->data[j], 'A' + (j % 26)); + } + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_011: regression guard + * + * Guard against regressions for: + * high surrogate followed by non-low surrogate. + * + * Expected behavior is: + * first code unit substituted with errCh, second processed normally. 
+ * ----------------------------------------------------------------------- */ +START_TEST(core_011) +{ + bstring b; + int ret; + + { + cpUcs2 u[] = { 0xD83D, 0x0041 }; + unsigned char expected[] = { '?', 'A' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ret = memcmp(b->data, expected, 2); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } } END_TEST @@ -658,6 +772,7 @@ main(void) tcase_add_test(core, core_008); tcase_add_test(core, core_009); tcase_add_test(core, core_010); + tcase_add_test(core, core_011); suite_add_tcase(suite, core); /* Run tests */ SRunner *runner = srunner_create(suite); From 4ae9b8a97704de0e18178a1960e610f9b1e6fd89 Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Tue, 24 Feb 2026 19:12:46 +0100 Subject: [PATCH 4/6] fix truncation bounds check bug and use flag for error handling --- bstring/utf8util.c | 143 +++++++++++++++++++++++++++++---------------- tests/testutf8.c | 112 ++++++++++++++++++++++++++++++----- 2 files changed, 192 insertions(+), 63 deletions(-) diff --git a/bstring/utf8util.c b/bstring/utf8util.c index c6e2a80..3ce310f 100644 --- a/bstring/utf8util.c +++ b/bstring/utf8util.c @@ -184,6 +184,7 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) long v; int i; int ofs; + int invalid; if (NULL == iter || iter->next < 0) return errCh; if (iter->next >= iter->slen) { @@ -197,39 +198,64 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) iter->error = 0; c = chrs[0]; ofs = 0; + invalid = 0; if (c < 0xC0 || c > 0xFD) { - if (c >= 0x80) goto ErrMode; - v = c; - ofs = 1; + if (c >= 0x80) { + invalid = 1; + } else { + v = c; + ofs = 1; + } } else if (c < 0xE0) { - if (iter->next >= iter->slen + 1) goto ErrMode; - v = (c << 6u) - (0x0C0 << 6u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - v += c; - if (c >= 0x40 || v < 0x80) goto ErrMode; - ofs = 2; + if (iter->next + 1 >= 
iter->slen) { + invalid = 1; + } else { + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) { + invalid = 1; + } else { + ofs = 2; + } + } } else if (c < 0xF0) { - if (iter->next >= iter->slen + 2) goto ErrMode; - v = (c << 12) - (0x0E0 << 12u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - d = (unsigned char) ((unsigned) chrs[2] - 0x080); - v += (c << 6u) + d; - if ((c|d) >= 0x40 || v < 0x800 || - !isLegalUnicodeCodePoint(v)) goto ErrMode; - ofs = 3; + if (iter->next + 2 >= iter->slen) { + invalid = 1; + } else { + v = (c << 12) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) { + invalid = 1; + } else { + ofs = 3; + } + } } else if (c < 0xF8) { - if (iter->next >= iter->slen + 3) goto ErrMode; - v = (c << 18) - (0x0F0 << 18u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - d = (unsigned char) ((unsigned) chrs[2] - 0x080); - e = (unsigned char) ((unsigned) chrs[3] - 0x080); - v += (c << 12u) + (d << 6u) + e; - if ((c|d|e) >= 0x40 || v < 0x10000 || - !isLegalUnicodeCodePoint(v)) goto ErrMode; - ofs = 4; + if (iter->next + 3 >= iter->slen) { + invalid = 1; + } else { + v = (c << 18) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12u) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || + !isLegalUnicodeCodePoint(v)) { + invalid = 1; + } else { + ofs = 4; + } + } } else { /* 5 and 6 byte encodings are invalid */ - ErrMode:; + invalid = 1; + } + + if (invalid) { iter->error = 1; v = errCh; for (i = iter->next+1; i < iter->slen; i++) { @@ -261,6 +287,7 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) unsigned char d; unsigned char e; long v; + int invalid; if 
(NULL == iter || iter->next < 0) return errCh; if (iter->next >= iter->slen) { @@ -273,35 +300,51 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) iter->error = 0; c = chrs[0]; + invalid = 0; if (c < 0xC0 || c > 0xFD) { - if (c >= 0x80) goto ErrMode; - v = c; + if (c >= 0x80) { + invalid = 1; + } else { + v = c; + } } else if (c < 0xE0) { - if (iter->next >= iter->slen + 1) goto ErrMode; - v = (c << 6u) - (0x0C0 << 6u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - v += c; - if (c >= 0x40 || v < 0x80) goto ErrMode; + if (iter->next + 1 >= iter->slen) { + invalid = 1; + } else { + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) invalid = 1; + } } else if (c < 0xF0) { - if (iter->next >= iter->slen + 2) goto ErrMode; - v = (c << 12lu) - (0x0E0 << 12u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - d = (unsigned char) ((unsigned) chrs[2] - 0x080); - v += (c << 6u) + d; - if ((c|d) >= 0x40 || v < 0x800 || - !isLegalUnicodeCodePoint(v)) goto ErrMode; + if (iter->next + 2 >= iter->slen) { + invalid = 1; + } else { + v = (c << 12lu) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) invalid = 1; + } } else if (c < 0xF8) { - if (iter->next >= iter->slen + 3) goto ErrMode; - v = (c << 18lu) - (0x0F0 << 18u); - c = (unsigned char) ((unsigned) chrs[1] - 0x080); - d = (unsigned char) ((unsigned) chrs[2] - 0x080); - e = (unsigned char) ((unsigned) chrs[3] - 0x080); - v += (c << 12lu) + (d << 6u) + e; - if ((c|d|e) >= 0x40 || v < 0x10000 || - !isLegalUnicodeCodePoint(v)) goto ErrMode; + if (iter->next + 3 >= iter->slen) { + invalid = 1; + } else { + v = (c << 18lu) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) 
((unsigned) chrs[3] - 0x080); + v += (c << 12lu) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || + !isLegalUnicodeCodePoint(v)) invalid = 1; + } } else { /* 5 and 6 byte encodings are invalid */ - ErrMode:; + invalid = 1; + } + + if (invalid) { iter->error = 1; v = errCh; } diff --git a/tests/testutf8.c b/tests/testutf8.c index 253ba84..e5545a6 100644 --- a/tests/testutf8.c +++ b/tests/testutf8.c @@ -224,9 +224,93 @@ START_TEST(core_004) END_TEST /* ----------------------------------------------------------------------- - * core_005: utf8IteratorGetCurrCodePoint — peek without advancing + * core_005: utf8IteratorGetNextCodePoint — truncated sequence bounds checks + * + * The backing arrays contain full valid code points, but slen is set so the + * sequence is truncated at the end. Iterator must treat each as invalid and + * return errCh instead of decoding bytes past slen. * ----------------------------------------------------------------------- */ START_TEST(core_005) +{ + struct utf8Iterator iter; + cpUcs4 cp; + + { + unsigned char data[] = { 0xC2, 0xA9 }; + utf8IteratorInit(&iter, data, 1); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 1); + } + + { + unsigned char data[] = { 0xE2, 0x82, 0xAC }; + utf8IteratorInit(&iter, data, 2); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 2); + } + + { + unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 }; + utf8IteratorInit(&iter, data, 3); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 3); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_006: utf8IteratorGetCurrCodePoint — 
truncated sequence bounds checks + * + * Peek must never decode bytes beyond slen. For each truncated sequence, it + * should return errCh, set iter.error, and leave iter.next unchanged. + * ----------------------------------------------------------------------- */ +START_TEST(core_006) +{ + struct utf8Iterator iter; + cpUcs4 cp; + + { + unsigned char data[] = { 0xC2, 0xA9 }; + utf8IteratorInit(&iter, data, 1); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } + + { + unsigned char data[] = { 0xE2, 0x82, 0xAC }; + utf8IteratorInit(&iter, data, 2); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } + + { + unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 }; + utf8IteratorInit(&iter, data, 3); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_007: utf8IteratorGetCurrCodePoint — peek without advancing + * ----------------------------------------------------------------------- */ +START_TEST(core_007) { struct utf8Iterator iter; unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */ @@ -258,11 +342,11 @@ START_TEST(core_005) END_TEST /* ----------------------------------------------------------------------- - * core_006: utf8ScanBackwardsForCodePoint — various positions + * core_008: utf8ScanBackwardsForCodePoint — various positions * * Data: © (0xC2 0xA9) at bytes 0-1, then 'A' (0x41) at byte 2 * ----------------------------------------------------------------------- */ -START_TEST(core_006) +START_TEST(core_008) { unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */ cpUcs4 out; @@ -298,9 +382,9 @@ START_TEST(core_006) END_TEST /* 
----------------------------------------------------------------------- - * core_007: buIsUTF8Content + * core_009: buIsUTF8Content * ----------------------------------------------------------------------- */ -START_TEST(core_007) +START_TEST(core_009) { bstring b; int ret; @@ -360,9 +444,9 @@ START_TEST(core_007) END_TEST /* ----------------------------------------------------------------------- - * core_008: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring + * core_010: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring * ----------------------------------------------------------------------- */ -START_TEST(core_008) +START_TEST(core_010) { bstring b; int ret; @@ -458,9 +542,9 @@ START_TEST(core_008) END_TEST /* ----------------------------------------------------------------------- - * core_009: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array + * core_011: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array * ----------------------------------------------------------------------- */ -START_TEST(core_009) +START_TEST(core_011) { cpUcs2 buf[16]; int ret; @@ -537,9 +621,9 @@ START_TEST(core_009) END_TEST /* ----------------------------------------------------------------------- - * core_010: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring + * core_012: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring * ----------------------------------------------------------------------- */ -START_TEST(core_010) +START_TEST(core_012) { bstring b; int ret; @@ -726,7 +810,7 @@ START_TEST(core_010) END_TEST /* ----------------------------------------------------------------------- - * core_011: regression guard + * core_013: regression guard * * Guard against regressions for: * high surrogate followed by non-low surrogate. @@ -734,7 +818,7 @@ END_TEST * Expected behavior is: * first code unit substituted with errCh, second processed normally. 
* ----------------------------------------------------------------------- */ -START_TEST(core_011) +START_TEST(core_013) { bstring b; int ret; @@ -773,6 +857,8 @@ main(void) tcase_add_test(core, core_009); tcase_add_test(core, core_010); tcase_add_test(core, core_011); + tcase_add_test(core, core_012); + tcase_add_test(core, core_013); suite_add_tcase(suite, core); /* Run tests */ SRunner *runner = srunner_create(suite); From 2ed0053d984aea3ea3e93f9642beea6e914e12b0 Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Tue, 24 Feb 2026 20:08:45 +0100 Subject: [PATCH 5/6] return when encountering invalid continuation bytes --- bstring/utf8util.c | 7 ++++++- tests/testutf8.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/bstring/utf8util.c b/bstring/utf8util.c index 3ce310f..d8a69f7 100644 --- a/bstring/utf8util.c +++ b/bstring/utf8util.c @@ -97,7 +97,6 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, return 0; } else if (msg[pos] < 0xC0) { if (0 == pos) return -__LINE__; - ret = -__LINE__; if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { pos--; ret = 1; @@ -119,6 +118,7 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, } if (msg[pos] < 0xE0) { if (pos + 1 >= len) return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; v1 = msg[pos] & ~0xE0; v2 = msg[pos+1] & ~0xC0; v1 = (v1 << 6) + v2; @@ -128,6 +128,8 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, } if (msg[pos] < 0xF0) { if (pos + 2 >= len) return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; + if ((msg[pos+2] & 0xC0) != 0x80) return -__LINE__; v1 = msg[pos] & ~0xF0; v2 = msg[pos+1] & ~0xC0; v3 = msg[pos+2] & ~0xC0; @@ -141,6 +143,9 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, if (msg[pos] >= 0xF8) return -__LINE__; if (pos + 3 >= len) return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; + if 
((msg[pos+2] & 0xC0) != 0x80) return -__LINE__; + if ((msg[pos+3] & 0xC0) != 0x80) return -__LINE__; v1 = msg[pos] & ~0xF8; v2 = msg[pos+1] & ~0xC0; v3 = msg[pos+2] & ~0xC0; diff --git a/tests/testutf8.c b/tests/testutf8.c index e5545a6..792f737 100644 --- a/tests/testutf8.c +++ b/tests/testutf8.c @@ -838,6 +838,40 @@ START_TEST(core_013) } END_TEST +/* ----------------------------------------------------------------------- + * core_014: utf8ScanBackwardsForCodePoint — invalid continuation bytes + * + * Each case starts at a lead byte but includes one or more non-continuation + * trailing bytes. Scanner must reject these and return an error. + * ----------------------------------------------------------------------- */ +START_TEST(core_014) +{ + cpUcs4 out = 0; + int ret; + + /* Invalid 2-byte sequence: second byte must be 10xxxxxx */ + { + unsigned char data[] = { 0xC2, 0x41 }; + ret = utf8ScanBackwardsForCodePoint(data, 2, 0, &out); + ck_assert(ret < 0); + } + + /* Invalid 3-byte sequence: middle byte must be 10xxxxxx */ + { + unsigned char data[] = { 0xE2, 0x28, 0xAC }; + ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out); + ck_assert(ret < 0); + } + + /* Invalid 4-byte sequence: third byte must be 10xxxxxx */ + { + unsigned char data[] = { 0xF0, 0x9F, 0x41, 0x80 }; + ret = utf8ScanBackwardsForCodePoint(data, 4, 0, &out); + ck_assert(ret < 0); + } +} +END_TEST + int main(void) { @@ -859,6 +893,7 @@ main(void) tcase_add_test(core, core_011); tcase_add_test(core, core_012); tcase_add_test(core, core_013); + tcase_add_test(core, core_014); suite_add_tcase(suite, core); /* Run tests */ SRunner *runner = srunner_create(suite); From 01c0a19312569b805ffaf96aa6d4f97f36f4c70c Mon Sep 17 00:00:00 2001 From: Daniel Markstedt Date: Tue, 24 Feb 2026 20:14:59 +0100 Subject: [PATCH 6/6] uppercase integer literal notation to prevent ambiguity --- bstring/utf8util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bstring/utf8util.c b/bstring/utf8util.c
index d8a69f7..8362d25 100644 --- a/bstring/utf8util.c +++ b/bstring/utf8util.c @@ -326,7 +326,7 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) if (iter->next + 2 >= iter->slen) { invalid = 1; } else { - v = (c << 12lu) - (0x0E0 << 12u); + v = (c << 12UL) - (0x0E0 << 12u); c = (unsigned char) ((unsigned) chrs[1] - 0x080); d = (unsigned char) ((unsigned) chrs[2] - 0x080); v += (c << 6u) + d; @@ -337,11 +337,11 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) if (iter->next + 3 >= iter->slen) { invalid = 1; } else { - v = (c << 18lu) - (0x0F0 << 18u); + v = (c << 18UL) - (0x0F0 << 18u); c = (unsigned char) ((unsigned) chrs[1] - 0x080); d = (unsigned char) ((unsigned) chrs[2] - 0x080); e = (unsigned char) ((unsigned) chrs[3] - 0x080); - v += (c << 12lu) + (d << 6u) + e; + v += (c << 12UL) + (d << 6u) + e; if ((c|d|e) >= 0x40 || v < 0x10000 || !isLegalUnicodeCodePoint(v)) invalid = 1; }