From b44af78d04a519c1bfd8311ea009a063e30e67f5 Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Mon, 23 Feb 2026 23:14:18 +0100
Subject: [PATCH 1/6] port UTF8 string manipulation modules from bstrlib

Port the UTF-8 string manipulation modules from bstrlib to this fork. Credits to Paul Hsieh.

utf8util is a standalone low-level module providing a forward iterator over UTF-8 byte sequences (utf8IteratorInit, utf8IteratorGetNextCodePoint, utf8IteratorGetCurrCodePoint, utf8ScanBackwardsForCodePoint) along with the cpUcs4/cpUcs2 type definitions and the isLegalUnicodeCodePoint macro.

buniutil builds on top of it and bstrlib to provide four higher-level functions: buIsUTF8Content, buAppendBlkUcs4, buGetBlkUTF16, and buAppendBlkUTF16. Both modules are compiled into the main libbstring binary, enabled by default and controlled by the new enable-utf8 build option.

Two adaptations were made to fit bstring's conventions: const_bstring was replaced with const bstring throughout (bstring dropped that typedef), and BSTR_PUBLIC visibility attributes were added to all public declarations.

A new test module tests/testutf8.c was written from scratch, covering the full API surface including ASCII and multi-byte iteration, error recovery, surrogate pair encoding/decoding, BOM handling, and null/invalid-argument guards.
---
 README.md           |   4 +-
 bstring/buniutil.c  | 317 +++++++++++++++++++++
 bstring/buniutil.h  | 100 +++++++
 bstring/meson.build |  12 +-
 bstring/utf8util.c  | 300 ++++++++++++++++++++
 bstring/utf8util.h  | 106 +++++++
 doc/introduction.md |  26 +-
 meson_options.txt   |   6 +
 tests/meson.build   |  12 +
 tests/testutf8.c    | 668 ++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 1546 insertions(+), 5 deletions(-)
 create mode 100644 bstring/buniutil.c
 create mode 100644 bstring/buniutil.h
 create mode 100644 bstring/utf8util.c
 create mode 100644 bstring/utf8util.h
 create mode 100644 tests/testutf8.c

diff --git a/README.md b/README.md
index af6d18b..a77ba33 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ Among the features achieved are:
   expect '\0' terminated char buffers
 - Improved overall performance of common string operations
 - Functional equivalency with other more modern languages
+- Optional API for manipulating UTF-8 encoded strings
 
 ## bstring fork
 
@@ -27,8 +28,7 @@ features (or mis-features, depending on your point of view) are included:
 2. Improved test suite using the [Check][] library
 3. Continuous integration via GitHub Actions, including memory profiling with [Valgrind][]
 4. Remove C++ wrapper code, returning this to a pure C library
-5. No UTF8 string manipulation support
-6. Documentation generation with [Doxygen][]
+5. Documentation generation with [Doxygen][]
 
 Currently this fork should be binary-compatible with the original code. The
 only source incompatibility is the removal of the `const_bstring` type.
diff --git a/bstring/buniutil.c b/bstring/buniutil.c
new file mode 100644
index 0000000..004a061
--- /dev/null
+++ b/bstring/buniutil.c
@@ -0,0 +1,317 @@
+/* Copyright 2002-2015 Paul Hsieh
+ * This file is part of Bstrlib.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *    3. Neither the name of bstrlib nor the names of its contributors may be
+ *       used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * GNU General Public License Version 2 (the "GPL").
+ */
+
+/*
+ * buniutil.c
+ *
+ * This file is not necessarily part of the core bstring library itself, but
+ * is an implementation of basic UTF-8 processing for bstrings.  This module
+ * depends on bstrlib.c and utf8util.c.
+ */
+
+#include "bstrlib.h"
+#include "buniutil.h"
+
+#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL)
+
+/*  int buIsUTF8Content (const bstring bu)
+ *
+ *  Return 1 if all of bu decodes as UTF-8 (an empty string qualifies), or
+ *  0 if bdata(bu) is NULL or any decoded value is <= 0 (errors and U+0000).
+ */
+int
+buIsUTF8Content(const bstring bu)
+{
+	struct utf8Iterator iter;
+
+	if (NULL == bdata(bu)) return 0;
+	for (utf8IteratorInit(&iter, bu->data, bu->slen);
+	     iter.next < iter.slen;) {
+		if (0 >= utf8IteratorGetNextCodePoint(&iter, -1)) return 0;
+	}
+	return 1;
+}
+
+/*  int buGetBlkUTF16 (cpUcs2 *ucs2, int len, cpUcs4 errCh,
+ *                     const bstring bu, int pos)
+ *
+ *  Convert the UTF-8 content of bu, skipping the first pos code points,
+ *  to UTF-16, writing at most len words to ucs2 and zero-filling the rest.
+ *  Unparsable input becomes errCh (U+FFFD if errCh is illegal).  Returns
+ *  the number of words written, or BSTR_ERR on bad arguments or when
+ *  nothing remains to decode after skipping pos code points.
+ */
+int
+buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh,
+              const bstring bu, int pos)
+{
+	struct tagbstring t;
+	struct utf8Iterator iter;
+	cpUcs4 ucs4;
+	int i, j;
+
+	if (!isLegalUnicodeCodePoint(errCh))
+		errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
+	if (NULL == ucs2 || 0 >= len || NULL == bdata(bu) || 0 > pos)
+		return BSTR_ERR;
+
+	for (j=0, i=0; j < bu->slen; j++) {
+		if (0x80 != (0xC0 & bu->data[j])) { /* not a continuation byte */
+			if (i >= pos) break;
+			i++;
+		}
+	}
+
+	t.mlen = -1;
+	t.data = bu->data + j;
+	t.slen = bu->slen - j;
+
+	utf8IteratorInit(&iter, t.data, t.slen);
+
+	ucs4 = BSTR_ERR; /* stays negative if nothing gets decoded */
+	for (i=0; 0 < len && iter.next < iter.slen &&
+	          0 <= (ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh));
+	     i++) {
+		if (ucs4 < 0x10000) {
+			*ucs2++ = (cpUcs2) ucs4;
+			len--;
+		} else {
+			if (len < 2) { /* no room left for a surrogate pair */
+				*ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
+				len--;
+			} else {
+				long y = ucs4 - 0x10000;
+				ucs2[0] = (cpUcs2) (0xD800 | (y >> 10));
+				ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF));
+				len -= 2;
+				ucs2 += 2;
+				i++; /* a surrogate pair counts as two words */
+			}
+		}
+	}
+	while (0 < len) { /* zero-fill the unused tail of ucs2 */
+		*ucs2++ = 0;
+		len--;
+	}
+
+	utf8IteratorUninit(&iter);
+	if (0 > ucs4) return BSTR_ERR;
+	return i;
+}
+
+/*
+
+Unicode                   UTF-8
+-------                   -----
+U-00000000 - U-0000007F:  0xxxxxxx
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
+U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+UTF-32: U-000000 - U-10FFFF
+
+*/
+
+/*  int buAppendBlkUcs4 (bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh)
+ *
+ *  Encode len UCS-4 code points from bu as UTF-8 and append them to b.
+ *  An illegal code point is replaced by errCh; if errCh is itself illegal
+ *  the first one fails the call.  Returns BSTR_OK, or BSTR_ERR with b->slen
+ *  reset.  NOTE(review): data[slen] is not re-terminated then -- confirm.
+ */
+int
+buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh)
+{
+	int i, oldSlen;
+
+	if (NULL == bu || NULL == b || 0 > len ||
+	    0 > (oldSlen = blengthe(b, -1))) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; /* ~0: no substitute */
+
+	for (i=0; i < len; i++) {
+		unsigned char c[6];
+		cpUcs4 v = bu[i];
+
+		if (!isLegalUnicodeCodePoint(v)) {
+			if (~0 == errCh) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+			v = errCh;
+		}
+
+		if (v < 0x80) { /* one byte */
+			if (BSTR_OK != bconchar(b, (char) v)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else if (v < 0x800) { /* two bytes */
+			c[0] = (unsigned char) ( (v >>  6)         + 0xc0);
+			c[1] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk(b, c, 2)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else if (v < 0x10000) { /* three bytes */
+			c[0] = (unsigned char) ( (v >> 12)         + 0xe0);
+			c[1] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[2] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk(b, c, 3)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else
+#if 0
+			if (v < 0x200000)
+#endif
+		{ /* four bytes: v was already checked to be a legal code point */
+			c[0] = (unsigned char) ( (v >> 18)         + 0xf0);
+			c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[3] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk(b, c, 4)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		}
+#if 0
+		else if (v < 0x4000000) {
+			c[0] = (unsigned char) ( (v >> 24)         + 0xf8);
+			c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[3] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[4] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk(b, c, 5)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else {
+			c[0] = (unsigned char) ( (v >> 30)         + 0xfc);
+			c[1] = (unsigned char) (((v >> 24) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
+			c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[4] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[5] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk(b, c, 6)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		}
+#endif
+	}
+	return BSTR_OK;
+}
+
+#define endSwap(cs, mode) \
+	((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
+#define TEMP_UCS4_BUFFER_SIZE (64)
+
+/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2 *utf16, int len,
+ *                        cpUcs2 *bom, cpUcs4 errCh)
+ *
+ *  Append len UTF-16 code units (utf16) to bu as UTF-8.  Byte order is
+ *  taken from *bom when it already holds 0xFEFF or 0xFFFE; otherwise a
+ *  leading BOM in utf16 selects it, is stored in *bom (if bom is non-NULL)
+ *  and is dropped from the output.  Lone or mismatched surrogates are
+ *  replaced by errCh; if errCh is not a legal code point the conversion
+ *  fails instead.  Returns BSTR_OK, or BSTR_ERR on bad arguments or a
+ *  failed conversion, in which case bu->slen is reset to its entry value.
+ */
+int
+buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom,
+                 cpUcs4 errCh)
+{
+	cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
+	int cc, i, sm, oldSlen;
+
+	if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0;
+	if (len == 0) return BSTR_OK;
+
+	oldSlen = bu->slen;
+	i = 0;
+
+	/* Check for BOM character and select endianness.  Also remove the
+	   BOM from the stream, since there is no need for it in UTF-8. */
+	if (bom && (cpUcs2) 0xFFFE == *bom) {
+		sm = 8;
+	} else if (bom && (cpUcs2) 0xFEFF == *bom) {
+		sm = 0;
+	} else if (utf16[i] == (cpUcs2) 0xFFFE) {
+		if (bom) *bom = utf16[i];
+		sm = 8;
+		i++;
+	} else if (utf16[i] == (cpUcs2) 0xFEFF) {
+		if (bom) *bom = utf16[i];
+		sm = 0;
+		i++;
+	} else {
+		sm = 0; /* Assume local endianness. */
+	}
+
+	cc = 0;
+	for (; i < len; i++) {
+		cpUcs4 c, v;
+		v = endSwap(utf16[i], sm);
+
+		if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
+			if (v >= 0xDC00 || i + 1 >= len) { /* lone low, or high at end */
+			ErrMode:;
+				if (~0 == errCh) {
+				ErrReturn:;
+					bu->slen = oldSlen;
+					return BSTR_ERR;
+				}
+				v = errCh;
+			} else {
+				i++;
+				c = endSwap(utf16[i], sm); /* must be a low surrogate */
+				if (c < 0xDC00 || c > 0xDFFF) goto ErrMode;
+				v = ((v - 0xD800) << 10) + (c - 0xDC00) + 0x10000;
+			}
+		}
+		buff[cc] = v;
+		cc++;
+		if (cc >= TEMP_UCS4_BUFFER_SIZE) { /* flush the staging buffer */
+			if (0 > buAppendBlkUcs4(bu, buff, cc, errCh))
+				goto ErrReturn;
+			cc = 0;
+		}
+	}
+	if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) goto ErrReturn;
+
+	return BSTR_OK;
+}
diff --git a/bstring/buniutil.h b/bstring/buniutil.h
new file mode 100644
index 0000000..7114991
--- /dev/null
+++ b/bstring/buniutil.h
@@ -0,0 +1,100 @@
+/* Copyright 2002-2015 Paul Hsieh
+ * This file is part of Bstrlib.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *    3. Neither the name of bstrlib nor the names of its contributors may be
+ *       used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * GNU General Public License Version 2 (the "GPL").
+ */
+
+/**
+ * \file
+ * \brief Interface for basic Unicode utility functions for bstrings.
+ *
+ * Depends on bstrlib.h and utf8util.h.
+ */
+
+#ifndef BSTRLIB_UNICODE_UTILITIES
+#define BSTRLIB_UNICODE_UTILITIES
+
+#include "bstrlib.h"
+#include "utf8util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Scan a bstring and return 1 if its entire content consists of valid UTF-8
+ * encoded code points, otherwise return 0.
+ */
+BSTR_PUBLIC int
+buIsUTF8Content(const bstring bu);
+
+/**
+ * Convert an array of UCS-4 code points (bu, len elements) to UTF-8 and
+ * append the result to the bstring b.
+ *
+ * Any invalid code point is replaced by errCh. If errCh is itself not a
+ * valid code point, translation halts on the first error and BSTR_ERR is
+ * returned. Otherwise BSTR_OK is returned.
+ */
+BSTR_PUBLIC int
+buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh);
+
+/* For those unfortunate enough to be stuck supporting UTF-16. */
+
+/**
+ * Convert the UTF-8 bstring bu (starting at code-point offset pos) to a
+ * sequence of UTF-16 encoded code units written to ucs2 (at most len units).
+ *
+ * Returns the number of UCS-2 16-bit words written. Any unparsable code
+ * point is translated to errCh.
+ */
+BSTR_PUBLIC int
+buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh,
+              const bstring bu, int pos);
+
+/**
+ * Append an array of UTF-16 code units (utf16, len elements) to the UTF-8
+ * bstring bu.
+ *
+ * Any invalid code point is replaced by errCh. If errCh is itself not a
+ * valid code point, translation halts on the first error and BSTR_ERR is
+ * returned. Otherwise BSTR_OK is returned. If a byte order mark has been
+ * previously read it may be passed in via bom; if *bom is 0 it will be
+ * filled in from the first character if it is a BOM.
+ */
+BSTR_PUBLIC int
+buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom,
+                 cpUcs4 errCh);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BSTRLIB_UNICODE_UTILITIES */
diff --git a/bstring/meson.build b/bstring/meson.build
index 01a8043..bb46e8c 100644
--- a/bstring/meson.build
+++ b/bstring/meson.build
@@ -1,8 +1,16 @@
-install_headers(['bstraux.h', 'bstrlib.h'])
+bstring_sources = ['bstraux.c', 'bstrlib.c']
+bstring_headers = ['bstraux.h', 'bstrlib.h']
+
+if get_option('enable-utf8')
+    bstring_sources += ['buniutil.c', 'utf8util.c']
+    bstring_headers += ['buniutil.h', 'utf8util.h']
+endif
+
+install_headers(bstring_headers)
 
 libbstring = library(
     meson.project_name(),
-    ['bstraux.c', 'bstrlib.c'],
+    bstring_sources,
     version: meson.project_version(),
     soversion: '1',
     include_directories: bstring_inc,
diff --git a/bstring/utf8util.c b/bstring/utf8util.c
new file mode 100644
index 0000000..48cf540
--- /dev/null
+++ b/bstring/utf8util.c
@@ -0,0 +1,300 @@
+/* Copyright 2002-2015 Paul Hsieh
+ * This file is part of Bstrlib.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *    3. Neither the name of bstrlib nor the names of its contributors may be
+ *       used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * GNU General Public License Version 2 (the "GPL").
+ */
+
+/*
+ * utf8util.c
+ *
+ * This file is not necessarily part of the core bstring library itself, but
+ * is a generic module for implementing UTF-8 utility functions.
+ */
+
+#include "utf8util.h"
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL 0
+#else
+#define NULL ((void *)0)
+#endif
+#endif
+
+/* A legal code point is not a surrogate (U+D800..U+DFFF), is at most
+   U+10FFFF, and is not a U+xxFFFE / U+xxFFFF noncharacter. */
+#define isLegalUnicodeCodePoint(v) \
+	((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \
+	 (((unsigned long)(v)) <= 0x0010FFFFL) && \
+	 (((v)|0x1F0001) != 0x1FFFFFL))
+
+void
+utf8IteratorInit(struct utf8Iterator *iter, unsigned char *data, int slen)
+{
+	if (iter) { /* a NULL iterator is silently ignored */
+		iter->data  = data;
+		iter->slen  = (iter->data && slen >= 0) ? slen : -1; /* -1: unusable */
+		iter->start = -1; /* no code point has been read yet */
+		iter->next  = (iter->slen >= 0) ? 0 : -1;
+		iter->error = (iter->slen >= 0) ? 0 : 1; /* bad input flags an error */
+	}
+}
+
+void
+utf8IteratorUninit(struct utf8Iterator *iter)
+{
+	if (iter) { /* a NULL iterator is silently ignored */
+		iter->data  = NULL;
+		iter->slen  = -1;
+		iter->start = iter->next = -1; /* iter->error is left unchanged */
+	}
+}
+
+int
+utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, int pos,
+                               cpUcs4 *out)
+{
+	cpUcs4 v1, v2, v3, v4, x;
+	int ret;
+	if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) {
+		return -__LINE__; /* every error code is a negated source line */
+	}
+	if (!out) out = &x; /* out is optional: decode into a scratch slot */
+	ret = 0; /* number of bytes stepped back to reach the lead byte */
+	if (msg[pos] < 0x80) { /* single-byte (ASCII) code point */
+		*out = msg[pos];
+		return 0;
+	} else if (msg[pos] < 0xC0) { /* continuation byte: find its lead */
+		if (0 == pos) return -__LINE__;
+		ret = -__LINE__;
+		if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { /* lead is 1 back */
+			pos--;
+			ret = 1;
+		} else {
+			if (1 == pos) return -__LINE__;
+			if ((msg[pos-1] | 0x3F) != 0xBF) return -__LINE__;
+			if (msg[pos-2] >= 0xE0 && msg[pos-2] < 0xF8) { /* lead is 2 back */
+				pos -= 2;
+				ret = 2;
+			} else {
+				if (2 == pos) return -__LINE__;
+				if ((msg[pos-2] | 0x3F) != 0xBF) return -__LINE__;
+				if ((msg[pos-3]|0x07) == 0xF7) { /* 4-byte lead is 3 back */
+					pos -= 3;
+					ret = 3;
+				} else return -__LINE__;
+			}
+		}
+	}
+	if (msg[pos] < 0xE0) { /* two-byte sequence */
+		if (pos + 1 >= len) return -__LINE__;
+		v1 = msg[pos]   & ~0xE0;
+		v2 = msg[pos+1] & ~0xC0; /* NOTE(review): not checked to be 10xxxxxx */
+		v1 = (v1 << 6) + v2;
+		if (v1 < 0x80) return -__LINE__; /* overlong encoding */
+		*out = v1;
+		return ret;
+	}
+	if (msg[pos] < 0xF0) { /* three-byte sequence */
+		if (pos + 2 >= len) return -__LINE__;
+		v1 = msg[pos]   & ~0xF0;
+		v2 = msg[pos+1] & ~0xC0;
+		v3 = msg[pos+2] & ~0xC0;
+		v1 = (v1 << 12) + (v2 << 6) + v3;
+		if (v1 < 0x800) return -__LINE__; /* overlong encoding */
+		if (!isLegalUnicodeCodePoint(v1)) return -__LINE__;
+		*out = v1;
+		return ret;
+	}
+
+	if (msg[pos] >= 0xF8) return -__LINE__; /* 5/6-byte leads are rejected */
+
+	if (pos + 3 >= len) return -__LINE__; /* four-byte sequence must fit */
+	v1 = msg[pos]   & ~0xF8;
+	v2 = msg[pos+1] & ~0xC0;
+	v3 = msg[pos+2] & ~0xC0;
+	v4 = msg[pos+3] & ~0xC0;
+	v1 = (v1 << 18) + (v2 << 12) + (v3 << 6) + v4;
+	if (v1 < 0x10000) return -__LINE__; /* overlong encoding */
+	if (!isLegalUnicodeCodePoint(v1)) return -__LINE__;
+	*out = v1;
+	return ret;
+}
+
+/*
+Code point                UTF-8
+----------                -----
+U-00000000 - U-0000007F:  0xxxxxxx
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
+U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+*/
+
+/*
+ *  Decode the code point at iter->next and advance past it.
+ *
+ *  On return iter->start is the offset of the bytes just consumed and
+ *  iter->next the offset of the next unread byte.
+ *
+ *  A malformed, overlong, illegal or truncated sequence sets iter->error,
+ *  yields errCh and skips the lead byte plus any continuation bytes that
+ *  follow it.  errCh is also returned when iter is unusable or exhausted.
+ */
+cpUcs4
+utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
+{
+	unsigned char *chrs;
+	unsigned char c, d, e;
+	long v;
+	int i, ofs;
+
+	if (NULL == iter || iter->next < 0) return errCh;
+	if (iter->next >= iter->slen) {
+		iter->start = iter->slen;
+		return errCh;
+	}
+	if (NULL == iter->data || iter->next < 0 ||
+	    utf8IteratorNoMore(iter)) return errCh;
+	chrs = iter->data + iter->next;
+
+	iter->error = 0;
+	c = chrs[0];
+	ofs = 0;
+
+	if (c < 0xC0 || c > 0xFD) {
+		if (c >= 0x80) goto ErrMode;
+		v = c;
+		ofs = 1;
+	} else if (c < 0xE0) {
+		if (iter->slen - iter->next < 2) goto ErrMode; /* truncated */
+		v = (c << 6u) - (0x0C0 << 6u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		v += c;
+		if (c >= 0x40 || v < 0x80) goto ErrMode;
+		ofs = 2;
+	} else if (c < 0xF0) {
+		if (iter->slen - iter->next < 3) goto ErrMode; /* truncated */
+		v = (c << 12) - (0x0E0 << 12u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		v += (c << 6u) + d;
+		if ((c|d) >= 0x40 || v < 0x800 ||
+		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+		ofs = 3;
+	} else if (c < 0xF8) {
+		if (iter->slen - iter->next < 4) goto ErrMode; /* truncated */
+		v = (c << 18) - (0x0F0 << 18u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+		v += (c << 12u) + (d << 6u) + e;
+		if ((c|d|e) >= 0x40 || v < 0x10000 ||
+		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+		ofs = 4;
+	} else { /* 5 and 6 byte encodings are invalid */
+	ErrMode:;
+		iter->error = 1;
+		v = errCh;
+		for (i = iter->next+1; i < iter->slen; i++) {
+			if ((iter->data[i] & 0xC0) != 0x80) break;
+		}
+		ofs = i - iter->next; /* resume at the next non-continuation byte */
+	}
+
+	iter->start = iter->next;
+	iter->next += ofs;
+	return v;
+}
+
+/*
+ *  Decode the code point at iter->next without advancing the iterator.
+ *
+ *  iter->start and iter->next are left untouched, except that iter->start
+ *  is set to iter->slen when the iterator is already exhausted.
+ *
+ *  A malformed, overlong, illegal or truncated sequence sets iter->error
+ *  and yields errCh.  errCh is also returned when iter is unusable or
+ *  exhausted.
+ */
+cpUcs4
+utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
+{
+	unsigned char *chrs;
+	unsigned char c, d, e;
+	long v;
+
+	if (NULL == iter || iter->next < 0) return errCh;
+	if (iter->next >= iter->slen) {
+		iter->start = iter->slen;
+		return errCh;
+	}
+	if (NULL == iter->data || iter->next < 0 ||
+	    utf8IteratorNoMore(iter)) return errCh;
+	chrs = iter->data + iter->next;
+
+	iter->error = 0;
+	c = chrs[0];
+
+	if (c < 0xC0 || c > 0xFD) {
+		if (c >= 0x80) goto ErrMode;
+		v = c;
+	} else if (c < 0xE0) {
+		if (iter->slen - iter->next < 2) goto ErrMode; /* truncated */
+		v = (c << 6u) - (0x0C0 << 6u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		v += c;
+		if (c >= 0x40 || v < 0x80) goto ErrMode;
+	} else if (c < 0xF0) {
+		if (iter->slen - iter->next < 3) goto ErrMode; /* truncated */
+		v = (c << 12lu) - (0x0E0 << 12u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		v += (c << 6u) + d;
+		if ((c|d) >= 0x40 || v < 0x800 ||
+		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+	} else if (c < 0xF8) {
+		if (iter->slen - iter->next < 4) goto ErrMode; /* truncated */
+		v = (c << 18lu) - (0x0F0 << 18u);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+		v += (c << 12lu) + (d << 6u) + e;
+		if ((c|d|e) >= 0x40 || v < 0x10000 ||
+		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+	} else { /* 5 and 6 byte encodings are invalid */
+	ErrMode:;
+		iter->error = 1;
+		v = errCh;
+	}
+	return v;
+}
diff --git a/bstring/utf8util.h b/bstring/utf8util.h
new file mode 100644
index 0000000..2d2a7ca
--- /dev/null
+++ b/bstring/utf8util.h
@@ -0,0 +1,106 @@
+/* Copyright 2002-2015 Paul Hsieh
+ * This file is part of Bstrlib.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *    3. Neither the name of bstrlib nor the names of its contributors may be
+ *       used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * GNU General Public License Version 2 (the "GPL").
+ */
+
+/**
+ * \file
+ * \brief Interface for low-level UTF-8 utility functions.
+ *
+ * This module is standalone and does not depend on bstrlib.
+ */
+
+#ifndef UTF8_UNICODE_UTILITIES
+#define UTF8_UNICODE_UTILITIES
+
+#include <limits.h>
+
+/* If bstrlib.h has not been included, define the visibility attribute here.
+   The #ifndef guard ensures we don't conflict if bstrlib.h came first. */
+#ifndef BSTR_PUBLIC
+# if __GNUC__ >= 4
+#  define BSTR_PUBLIC __attribute__ ((visibility ("default")))
+# else
+#  define BSTR_PUBLIC
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if INT_MAX >= 0x7fffffffUL
+typedef int cpUcs4;
+#elif LONG_MAX >= 0x7fffffffUL
+typedef long cpUcs4;
+#else
+#error This compiler is not supported
+#endif
+
+#if UINT_MAX == 0xFFFF
+typedef unsigned int cpUcs2;
+#elif USHRT_MAX == 0xFFFF
+typedef unsigned short cpUcs2;
+#elif UCHAR_MAX == 0xFFFF
+typedef unsigned char cpUcs2;
+#else
+#error This compiler is not supported
+#endif
+
+#define isLegalUnicodeCodePoint(v) \
+	((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \
+	 (((unsigned long)(v)) <= 0x0010FFFFL) && \
+	 (((v)|0x1F0001) != 0x1FFFFFL))
+
+struct utf8Iterator {
+	unsigned char *data; /* buffer being walked; NULL after Uninit */
+	int slen;            /* buffer length in bytes, -1 if unusable */
+	int start, next;     /* offsets of the last-read / next code point */
+	int error;           /* nonzero if the last read hit an error */
+};
+
+#define utf8IteratorNoMore(it) (!(it) || (it)->next >= (it)->slen)
+
+BSTR_PUBLIC void utf8IteratorInit(struct utf8Iterator *iter,
+                                  unsigned char *data, int slen);
+BSTR_PUBLIC void utf8IteratorUninit(struct utf8Iterator *iter);
+BSTR_PUBLIC cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter,
+                                                cpUcs4 errCh);
+BSTR_PUBLIC cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter,
+                                                cpUcs4 errCh);
+BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(unsigned char *msg, int len,
+                                              int pos, cpUcs4 *out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* UTF8_UNICODE_UTILITIES */
diff --git a/doc/introduction.md b/doc/introduction.md
index c140d48..11f8ec7 100644
--- a/doc/introduction.md
+++ b/doc/introduction.md
@@ -327,7 +327,7 @@ object in a multithreaded environment.
 Problems Not Solved
 -------------------
 
-Bstrlib is written for the C languages, which have inherent weaknesses that
+Bstrlib is written for the C language, which has inherent weaknesses that
 cannot be easily solved:
 
 1. Memory leaks: Forgetting to call `bdestroy` on a bstring that is about to
@@ -349,6 +349,29 @@ Other problems not addressed:
 > Note: except for spotty support of wide characters, the default C standard
   library does not address any of these problems either.
 
+Unicode functions
+-----------------
+
+The two modules utf8util.c and buniutil.c implement basic functions for
+parsing and collecting Unicode data in the UTF-8 format.  Unicode is
+described by a sequence of "code points" which are values between 0 and
+1114111 inclusive mapped to symbol content corresponding to nearly all
+the standardized scripts of the world.
+
+The semantics of Unicode code points is varied and complicated.  The
+base support of the better string library does not attempt to perform
+any interpretation of these code points.  The better string library
+solely provides support for iterating through Unicode code points,
+appending and extracting code points to and from bstrings, and parsing
+UTF-8 and UTF-16 from raw data.
+
+The types cpUcs4 and cpUcs2 are defined as 4 byte and 2 byte encoding
+formats corresponding to UCS4 and UCS2 respectively.  To test if a raw
+code point is valid, the macro isLegalUnicodeCodePoint() has been
+defined.  The UTF-8 iterator is defined by struct utf8Iterator.  To test
+whether the iterator has no more code points to walk through, the macro
+utf8IteratorNoMore() has been defined.
+
 The `bstest` Module
 -------------------
 
@@ -871,3 +894,4 @@ and testing of the Better String Library:
 * Richard A. Smith
 * Simon Ekstrom
 * Wayne Scott
+* Zed A. Shaw
diff --git a/meson_options.txt b/meson_options.txt
index 6b0b4fd..5bf78df 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -16,3 +16,9 @@ option(
     value: false,
     description: 'Build unit tests',
 )
+option(
+    'enable-utf8',
+    type: 'boolean',
+    value: true,
+    description: 'Build bstring library with UTF-8 support',
+)
diff --git a/tests/meson.build b/tests/meson.build
index 66b94bf..28ac1c0 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -15,3 +15,15 @@ test_executable_aux = executable(
 
 test('bstring unit tests', test_executable)
 test('bstring auxiliary unit tests', test_executable_aux)
+
+if get_option('enable-utf8')
+    test_executable_utf8 = executable(
+        'testutf8',
+        'testutf8.c',
+        link_with: libbstring,
+        include_directories: bstring_inc,
+        dependencies: check,
+    )
+
+    test('bstring UTF-8 unit tests', test_executable_utf8)
+endif
diff --git a/tests/testutf8.c b/tests/testutf8.c
new file mode 100644
index 0000000..1e28d17
--- /dev/null
+++ b/tests/testutf8.c
@@ -0,0 +1,668 @@
+/* Copyright (C) 2026 Daniel Markstedt <daniel@mindani.net>
+ * UTF-8 unit tests for the Better String Library
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *    3. Neither the name of bstrlib nor the names of its contributors may be
+ *       used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * GNU General Public License Version 2 (the "GPL").
+ */
+
+/*
+ * This file is the C unit test for the UTF-8 modules (utf8util, buniutil).
+ *
+ * Test data quick reference:
+ *   U+0041 'A'      = 0x41                   (1-byte ASCII)
+ *   U+00A9 '©'      = 0xC2 0xA9              (2-byte)
+ *   U+20AC '€'      = 0xE2 0x82 0xAC         (3-byte)
+ *   U+1F600 '😀'   = 0xF0 0x9F 0x98 0x80    (4-byte)
+ *   UTF-16 U+1F600  = { 0xD83D, 0xDE00 }     (surrogate pair)
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "buniutil.h"
+#include "bstrlib.h"
+#include "utf8util.h"
+#include <check.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* -----------------------------------------------------------------------
+ * core_000: utf8IteratorInit — valid inputs and error inputs
+ * ----------------------------------------------------------------------- */
+START_TEST(core_000)
+{
+	struct utf8Iterator iter;
+	unsigned char data[] = "Hello";
+
+	/* NULL iter pointer must not crash */
+	utf8IteratorInit(NULL, data, 5);
+
+	/* Valid initialisation */
+	utf8IteratorInit(&iter, data, 5);
+	ck_assert_int_eq(iter.slen,  5);
+	ck_assert_int_eq(iter.next,  0);
+	ck_assert_int_eq(iter.start, -1);
+	ck_assert_int_eq(iter.error, 0);
+	ck_assert(iter.data == data);
+
+	/* NULL data → sentinel values */
+	utf8IteratorInit(&iter, NULL, 5);
+	ck_assert_int_eq(iter.slen, -1);
+	ck_assert_int_eq(iter.next, -1);
+	ck_assert_int_eq(iter.error, 1);
+
+	/* Negative slen → sentinel values */
+	utf8IteratorInit(&iter, data, -1);
+	ck_assert_int_eq(iter.slen, -1);
+	ck_assert_int_eq(iter.next, -1);
+	ck_assert_int_eq(iter.error, 1);
+
+	/* Zero-length string is valid */
+	utf8IteratorInit(&iter, data, 0);
+	ck_assert_int_eq(iter.slen,  0);
+	ck_assert_int_eq(iter.next,  0);
+	ck_assert_int_eq(iter.error, 0);
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_001: utf8IteratorUninit — clears all fields; handles NULL gracefully
+ * ----------------------------------------------------------------------- */
+START_TEST(core_001)
+{
+	struct utf8Iterator iter;
+	unsigned char data[] = "Hello";
+
+	utf8IteratorInit(&iter, data, 5);
+	utf8IteratorUninit(&iter);
+	ck_assert(iter.data  == NULL);
+	ck_assert_int_eq(iter.slen,  -1);
+	ck_assert_int_eq(iter.start, -1);
+	ck_assert_int_eq(iter.next,  -1);
+
+	/* NULL pointer must not crash */
+	utf8IteratorUninit(NULL);
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_002: utf8IteratorGetNextCodePoint — ASCII string iteration
+ * ----------------------------------------------------------------------- */
+START_TEST(core_002)
+{
+	struct utf8Iterator iter;
+	unsigned char data[] = "ABC";
+	cpUcs4 cp;
+
+	utf8IteratorInit(&iter, data, 3);
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 'A');
+	ck_assert_int_eq(iter.error, 0);
+	ck_assert_int_eq(iter.start, 0);
+	ck_assert_int_eq(iter.next,  1);
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 'B');
+	ck_assert_int_eq(iter.start, 1);
+	ck_assert_int_eq(iter.next,  2);
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 'C');
+	ck_assert_int_eq(iter.start, 2);
+	ck_assert_int_eq(iter.next,  3);
+
+	/* Past end → errCh */
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, '?');
+
+	/* NULL iterator → errCh */
+	cp = utf8IteratorGetNextCodePoint(NULL, '?');
+	ck_assert_int_eq(cp, '?');
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_003: utf8IteratorGetNextCodePoint — multi-byte sequences
+ *
+ *   Sequence: © (U+00A9, 2-byte) € (U+20AC, 3-byte) 😀 (U+1F600, 4-byte)
+ *   Bytes:    C2 A9                E2 82 AC              F0 9F 98 80
+ * ----------------------------------------------------------------------- */
+START_TEST(core_003)
+{
+	struct utf8Iterator iter;
+	/* © € 😀 */
+	unsigned char data[] = {
+		0xC2, 0xA9,             /* U+00A9 © */
+		0xE2, 0x82, 0xAC,       /* U+20AC € */
+		0xF0, 0x9F, 0x98, 0x80  /* U+1F600 😀 */
+	};
+	cpUcs4 cp;
+
+	utf8IteratorInit(&iter, data, sizeof(data));
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x00A9);
+	ck_assert_int_eq(iter.error, 0);
+	ck_assert_int_eq(iter.start, 0);
+	ck_assert_int_eq(iter.next,  2);
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x20AC);
+	ck_assert_int_eq(iter.error, 0);
+	ck_assert_int_eq(iter.start, 2);
+	ck_assert_int_eq(iter.next,  5);
+
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x1F600);
+	ck_assert_int_eq(iter.error, 0);
+	ck_assert_int_eq(iter.start, 5);
+	ck_assert_int_eq(iter.next,  9);
+
+	/* Exhausted */
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, '?');
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_004: utf8IteratorGetNextCodePoint — invalid byte sequences
+ *
+ *   0x80 alone is a stray continuation byte (invalid lead).
+ *   0xFF is never valid in UTF-8.
+ * ----------------------------------------------------------------------- */
+START_TEST(core_004)
+{
+	struct utf8Iterator iter;
+	/* stray continuation, then a valid ASCII char */
+	unsigned char data_cont[] = { 0x80, 0x41 };
+	/* 0xFF is always invalid */
+	unsigned char data_ff[]   = { 0xFF, 0x41 };
+	cpUcs4 cp;
+
+	/* Stray continuation byte → error, iterator skips to next valid lead */
+	utf8IteratorInit(&iter, data_cont, sizeof(data_cont));
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, '?');
+	ck_assert_int_eq(iter.error, 1);
+	/* After error the iterator should have advanced past the bad byte(s) */
+	ck_assert(iter.next > 0);
+
+	/* 0xFF lead byte → error */
+	utf8IteratorInit(&iter, data_ff, sizeof(data_ff));
+	cp = utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, '?');
+	ck_assert_int_eq(iter.error, 1);
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_005: utf8IteratorGetCurrCodePoint — peek without advancing
+ * ----------------------------------------------------------------------- */
+START_TEST(core_005)
+{
+	struct utf8Iterator iter;
+	unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */
+	cpUcs4 cp;
+
+	utf8IteratorInit(&iter, data, sizeof(data));
+
+	/* Peek twice at the same position — must not advance */
+	cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x00A9);
+	ck_assert_int_eq(iter.next, 0); /* still at start */
+
+	cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x00A9);
+	ck_assert_int_eq(iter.next, 0);
+
+	/* Now advance with GetNext, then peek the next char */
+	utf8IteratorGetNextCodePoint(&iter, '?');
+	ck_assert_int_eq(iter.next, 2);
+
+	cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+	ck_assert_int_eq(cp, 0x41); /* 'A' */
+	ck_assert_int_eq(iter.next, 2); /* still not advanced */
+
+	/* NULL iterator → errCh */
+	cp = utf8IteratorGetCurrCodePoint(NULL, '?');
+	ck_assert_int_eq(cp, '?');
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_006: utf8ScanBackwardsForCodePoint — various positions
+ *
+ *   Data: © (0xC2 0xA9) at bytes 0-1, then 'A' (0x41) at byte 2
+ * ----------------------------------------------------------------------- */
+START_TEST(core_006)
+{
+	unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */
+	cpUcs4 out;
+	int ret;
+
+	/* pos=0 is the lead byte of © — ret=0, out=0xA9 */
+	ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out);
+	ck_assert_int_eq(ret, 0);
+	ck_assert_int_eq(out, 0x00A9);
+
+	/* pos=1 is the continuation byte — ret=1 (1 byte back to lead), out=0xA9 */
+	ret = utf8ScanBackwardsForCodePoint(data, 3, 1, &out);
+	ck_assert_int_eq(ret, 1);
+	ck_assert_int_eq(out, 0x00A9);
+
+	/* pos=2 is ASCII 'A' — ret=0, out='A' */
+	ret = utf8ScanBackwardsForCodePoint(data, 3, 2, &out);
+	ck_assert_int_eq(ret, 0);
+	ck_assert_int_eq(out, 0x41);
+
+	/* NULL msg → error (negative) */
+	ret = utf8ScanBackwardsForCodePoint(NULL, 3, 0, &out);
+	ck_assert(ret < 0);
+
+	/* pos out of range → error */
+	ret = utf8ScanBackwardsForCodePoint(data, 3, 3, &out);
+	ck_assert(ret < 0);
+
+	/* out=NULL is accepted; return value indicates success/failure */
+	ret = utf8ScanBackwardsForCodePoint(data, 3, 2, NULL);
+	ck_assert_int_eq(ret, 0);
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_007: buIsUTF8Content
+ * ----------------------------------------------------------------------- */
+START_TEST(core_007)
+{
+	bstring b;
+	int ret;
+
+	/* NULL bstring → 0 */
+	ret = buIsUTF8Content(NULL);
+	ck_assert_int_eq(ret, 0);
+
+	/* Empty string → 1 (vacuously true) */
+	b = bfromcstr("");
+	ck_assert(b != NULL);
+	ret = buIsUTF8Content(b);
+	ck_assert_int_eq(ret, 1);
+	bdestroy(b);
+
+	/* Pure ASCII → 1 */
+	b = bfromcstr("Hello, world!");
+	ck_assert(b != NULL);
+	ret = buIsUTF8Content(b);
+	ck_assert_int_eq(ret, 1);
+	bdestroy(b);
+
+	/* Valid multi-byte UTF-8: © € 😀 */
+	{
+		unsigned char utf8[] = {
+			0xC2, 0xA9,
+			0xE2, 0x82, 0xAC,
+			0xF0, 0x9F, 0x98, 0x80
+		};
+		b = blk2bstr(utf8, sizeof(utf8));
+		ck_assert(b != NULL);
+		ret = buIsUTF8Content(b);
+		ck_assert_int_eq(ret, 1);
+		bdestroy(b);
+	}
+
+	/* Invalid: stray 0x80 continuation byte → 0 */
+	{
+		unsigned char bad[] = { 0x41, 0x80, 0x41 };
+		b = blk2bstr(bad, sizeof(bad));
+		ck_assert(b != NULL);
+		ret = buIsUTF8Content(b);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* Invalid: truncated 2-byte sequence → 0 */
+	{
+		unsigned char bad[] = { 0xC2 }; /* lead without continuation */
+		b = blk2bstr(bad, sizeof(bad));
+		ck_assert(b != NULL);
+		ret = buIsUTF8Content(b);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_008: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring
+ * ----------------------------------------------------------------------- */
+START_TEST(core_008)
+{
+	bstring b;
+	int ret;
+
+	/* NULL arguments → BSTR_ERR */
+	b = bfromcstr("");
+	ck_assert(b != NULL);
+	ret = buAppendBlkUcs4(NULL, NULL, 0, '?');
+	ck_assert_int_eq(ret, BSTR_ERR);
+	ret = buAppendBlkUcs4(b, NULL, 1, '?');
+	ck_assert_int_eq(ret, BSTR_ERR);
+	bdestroy(b);
+
+	/* ASCII code points */
+	{
+		cpUcs4 pts[] = { 'H', 'i' };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 2, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 2);
+		ck_assert_int_eq(b->data[0], 'H');
+		ck_assert_int_eq(b->data[1], 'i');
+		bdestroy(b);
+	}
+
+	/* Mixed: © (U+00A9) and € (U+20AC) */
+	{
+		cpUcs4 pts[] = { 0x00A9, 0x20AC };
+		unsigned char expected[] = {
+			0xC2, 0xA9,       /* © */
+			0xE2, 0x82, 0xAC  /* € */
+		};
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 2, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 5);
+		ret = memcmp(b->data, expected, 5);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* 4-byte: 😀 (U+1F600) */
+	{
+		cpUcs4 pts[] = { 0x1F600 };
+		unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 1, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 4);
+		ret = memcmp(b->data, expected, 4);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* Invalid code point with valid errCh → substituted */
+	{
+		cpUcs4 pts[] = { 0xD800 }; /* surrogates are illegal */
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 1, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], '?');
+		bdestroy(b);
+	}
+
+	/* Invalid code point with invalid errCh → BSTR_ERR, bstring unchanged */
+	{
+		cpUcs4 pts[] = { 0xD800 };
+		b = bfromcstr("pre");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 1, 0xD800); /* errCh also invalid */
+		ck_assert_int_eq(ret, BSTR_ERR);
+		/* slen must be rolled back */
+		ck_assert_int_eq(b->slen, 3);
+		bdestroy(b);
+	}
+
+	/* Zero-length array → BSTR_OK, nothing appended */
+	{
+		cpUcs4 pts[] = { 'X' };
+		b = bfromcstr("pre");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUcs4(b, pts, 0, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 3);
+		bdestroy(b);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_009: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array
+ * ----------------------------------------------------------------------- */
+START_TEST(core_009)
+{
+	cpUcs2 buf[16];
+	int ret;
+
+	/* NULL arguments → BSTR_ERR */
+	{
+		unsigned char raw[] = { 0x41 };
+		bstring b = blk2bstr(raw, 1);
+		ck_assert(b != NULL);
+		ret = buGetBlkUTF16(NULL, 4, '?', b, 0);
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ret = buGetBlkUTF16(buf, 0, '?', b, 0);
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ret = buGetBlkUTF16(buf, 4, '?', NULL, 0);
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ret = buGetBlkUTF16(buf, 4, '?', b, -1);
+		ck_assert_int_eq(ret, BSTR_ERR);
+		bdestroy(b);
+	}
+
+	/* ASCII "AB" → UTF-16 { 0x0041, 0x0042, 0, ... } */
+	{
+		bstring b = bfromcstr("AB");
+		ck_assert(b != NULL);
+		memset(buf, 0xFF, sizeof(buf));
+		ret = buGetBlkUTF16(buf, 4, '?', b, 0);
+		ck_assert_int_eq(ret, 2);
+		ck_assert_int_eq(buf[0], 0x0041);
+		ck_assert_int_eq(buf[1], 0x0042);
+		ck_assert_int_eq(buf[2], 0); /* null-padded */
+		bdestroy(b);
+	}
+
+	/* © € → UTF-16 BMP values (U+00A9, U+20AC) */
+	{
+		unsigned char raw[] = {
+			0xC2, 0xA9,
+			0xE2, 0x82, 0xAC
+		};
+		bstring b = blk2bstr(raw, sizeof(raw));
+		ck_assert(b != NULL);
+		memset(buf, 0xFF, sizeof(buf));
+		ret = buGetBlkUTF16(buf, 4, '?', b, 0);
+		ck_assert_int_eq(ret, 2);
+		ck_assert_int_eq(buf[0], 0x00A9);
+		ck_assert_int_eq(buf[1], 0x20AC);
+		bdestroy(b);
+	}
+
+	/* pos=1 skips first code point */
+	{
+		bstring b = bfromcstr("AB");
+		ck_assert(b != NULL);
+		memset(buf, 0, sizeof(buf));
+		ret = buGetBlkUTF16(buf, 4, '?', b, 1);
+		ck_assert_int_eq(ret, 1);
+		ck_assert_int_eq(buf[0], 0x0042);
+		bdestroy(b);
+	}
+
+	/* Supplementary character 😀 (U+1F600) → surrogate pair */
+	{
+		unsigned char raw[] = { 0xF0, 0x9F, 0x98, 0x80 };
+		bstring b = blk2bstr(raw, sizeof(raw));
+		ck_assert(b != NULL);
+		memset(buf, 0, sizeof(buf));
+		ret = buGetBlkUTF16(buf, 4, '?', b, 0);
+		ck_assert_int_eq(ret, 2); /* one code point → two UTF-16 units */
+		ck_assert_int_eq(buf[0], 0xD83D); /* high surrogate */
+		ck_assert_int_eq(buf[1], 0xDE00); /* low surrogate */
+		bdestroy(b);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_010: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring
+ * ----------------------------------------------------------------------- */
+START_TEST(core_010)
+{
+	bstring b;
+	int ret;
+
+	/* NULL / bad arguments → BSTR_ERR */
+	b = bfromcstr("");
+	ck_assert(b != NULL);
+	{
+		cpUcs2 u[] = { 0x0041 };
+		ret = buAppendBlkUTF16(NULL, u, 1, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ret = buAppendBlkUTF16(b, NULL, 1, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ret = buAppendBlkUTF16(b, u, -1, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_ERR);
+	}
+	bdestroy(b);
+
+	/* Zero-length input → BSTR_OK, nothing appended */
+	{
+		cpUcs2 u[] = { 0x0041 };
+		b = bfromcstr("pre");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 0, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 3);
+		bdestroy(b);
+	}
+
+	/* ASCII "AB" in UTF-16 → "AB" in UTF-8 */
+	{
+		cpUcs2 u[] = { 0x0041, 0x0042 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 2);
+		ck_assert_int_eq(b->data[0], 'A');
+		ck_assert_int_eq(b->data[1], 'B');
+		bdestroy(b);
+	}
+
+	/* BMP characters: U+00A9 © and U+20AC € */
+	{
+		cpUcs2 u[] = { 0x00A9, 0x20AC };
+		unsigned char expected[] = {
+			0xC2, 0xA9,
+			0xE2, 0x82, 0xAC
+		};
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 5);
+		ret = memcmp(b->data, expected, 5);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* Surrogate pair: 😀 (U+1F600) = { 0xD83D, 0xDE00 } */
+	{
+		cpUcs2 u[] = { 0xD83D, 0xDE00 };
+		unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 4);
+		ret = memcmp(b->data, expected, 4);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* Little-endian BOM (0xFFFE) → byte-swapped input */
+	{
+		/* 'A' (0x0041) with bytes swapped = 0x4100, plus LE BOM */
+		cpUcs2 u[] = { 0xFFFE, 0x4100 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], 'A');
+		bdestroy(b);
+	}
+
+	/* Big-endian BOM (0xFEFF) is consumed and removed from output */
+	{
+		cpUcs2 u[] = { 0xFEFF, 0x0041 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], 'A');
+		bdestroy(b);
+	}
+}
+END_TEST
+
+int
+main(void)
+{
+	/* Build test suite */
+	Suite *suite = suite_create("bstr-utf8");
+	/* Core tests */
+	TCase *core = tcase_create("Core");
+	tcase_add_test(core, core_000);
+	tcase_add_test(core, core_001);
+	tcase_add_test(core, core_002);
+	tcase_add_test(core, core_003);
+	tcase_add_test(core, core_004);
+	tcase_add_test(core, core_005);
+	tcase_add_test(core, core_006);
+	tcase_add_test(core, core_007);
+	tcase_add_test(core, core_008);
+	tcase_add_test(core, core_009);
+	tcase_add_test(core, core_010);
+	suite_add_tcase(suite, core);
+	/* Run tests */
+	SRunner *runner = srunner_create(suite);
+	srunner_run_all(runner, CK_ENV);
+	int number_failed = srunner_ntests_failed(runner);
+	srunner_free(runner);
+	return (0 == number_failed) ? EXIT_SUCCESS : EXIT_FAILURE;
+}

From 2fcdc0fab4aa113e4b4e5a91936e46ad8e7b997f Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Tue, 24 Feb 2026 07:37:02 +0100
Subject: [PATCH 2/6] address static analysis bugs flagged by SonarQube

---
 bstring/buniutil.c | 27 +++++++++++++++++----------
 bstring/utf8util.c | 23 ++++++++++++++++-------
 bstring/utf8util.h |  7 ++++---
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/bstring/buniutil.c b/bstring/buniutil.c
index 004a061..ceafae7 100644
--- a/bstring/buniutil.c
+++ b/bstring/buniutil.c
@@ -78,7 +78,8 @@ buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh,
 	struct tagbstring t;
 	struct utf8Iterator iter;
 	cpUcs4 ucs4;
-	int i, j;
+	int i;
+	int j;
 
 	if (!isLegalUnicodeCodePoint(errCh))
 		errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
@@ -99,9 +100,10 @@ buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh,
 	utf8IteratorInit(&iter, t.data, t.slen);
 
 	ucs4 = BSTR_ERR;
-	for (i=0; 0 < len && iter.next < iter.slen &&
-	          0 <= (ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh));
-	     i++) {
+	for (i=0; 0 < len && iter.next < iter.slen; i++) {
+		ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh);
+		if (0 > ucs4) break;
+
 		if (ucs4 < 0x10000) {
 			*ucs2++ = (cpUcs2) ucs4;
 			len--;
@@ -155,13 +157,14 @@ UTF-32: U-000000 - U-10FFFF
 int
 buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh)
 {
-	int i, oldSlen;
+	int oldSlen;
 
-	if (NULL == bu || NULL == b || 0 > len ||
-	    0 > (oldSlen = blengthe(b, -1))) return BSTR_ERR;
+	if (NULL == bu || NULL == b || 0 > len) return BSTR_ERR;
+	oldSlen = blengthe(b, -1);
+	if (0 > oldSlen) return BSTR_ERR;
 	if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0;
 
-	for (i=0; i < len; i++) {
+	for (int i=0; i < len; i++) {
 		unsigned char c[6];
 		cpUcs4 v = bu[i];
 
@@ -255,7 +258,10 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom,
                  cpUcs4 errCh)
 {
 	cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
-	int cc, i, sm, oldSlen;
+	int cc;
+	int i;
+	int sm;
+	int oldSlen;
 
 	if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
 	if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0;
@@ -284,7 +290,8 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom,
 
 	cc = 0;
 	for (; i < len; i++) {
-		cpUcs4 c, v;
+		cpUcs4 c;
+		cpUcs4 v;
 		v = endSwap(utf16[i], sm);
 
 		if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
diff --git a/bstring/utf8util.c b/bstring/utf8util.c
index 48cf540..c6e2a80 100644
--- a/bstring/utf8util.c
+++ b/bstring/utf8util.c
@@ -78,10 +78,14 @@ utf8IteratorUninit(struct utf8Iterator *iter)
 }
 
 int
-utf8ScanBackwardsForCodePoint(unsigned char *msg, int len, int pos,
+utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos,
                                cpUcs4 *out)
 {
-	cpUcs4 v1, v2, v3, v4, x;
+	cpUcs4 v1;
+	cpUcs4 v2;
+	cpUcs4 v3;
+	cpUcs4 v4;
+	cpUcs4 x;
 	int ret;
 	if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) {
 		return -__LINE__;
@@ -173,10 +177,13 @@ U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 cpUcs4
 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 {
-	unsigned char *chrs;
-	unsigned char c, d, e;
+	const unsigned char *chrs;
+	unsigned char c;
+	unsigned char d;
+	unsigned char e;
 	long v;
-	int i, ofs;
+	int i;
+	int ofs;
 
 	if (NULL == iter || iter->next < 0) return errCh;
 	if (iter->next >= iter->slen) {
@@ -249,8 +256,10 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 cpUcs4
 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 {
-	unsigned char *chrs;
-	unsigned char c, d, e;
+	const unsigned char *chrs;
+	unsigned char c;
+	unsigned char d;
+	unsigned char e;
 	long v;
 
 	if (NULL == iter || iter->next < 0) return errCh;
diff --git a/bstring/utf8util.h b/bstring/utf8util.h
index 2d2a7ca..84aeda0 100644
--- a/bstring/utf8util.h
+++ b/bstring/utf8util.h
@@ -83,7 +83,8 @@ typedef unsigned char cpUcs2;
 struct utf8Iterator {
 	unsigned char *data;
 	int slen;
-	int start, next;
+	int start;
+	int next;
 	int error;
 };
 
@@ -96,8 +97,8 @@ BSTR_PUBLIC cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter,
                                                 cpUcs4 errCh);
 BSTR_PUBLIC cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter,
                                                 cpUcs4 errCh);
-BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(unsigned char *msg, int len,
-                                              int pos, cpUcs4 *out);
+BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(const unsigned char *msg,
+											  int len, int pos, cpUcs4 *out);
 
 #ifdef __cplusplus
 }

From 1b35a07d0f7a52653c784afa97c05583121c536a Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Tue, 24 Feb 2026 07:56:32 +0100
Subject: [PATCH 3/6] refactor surrogate substitution with flatter control flow

---
 bstring/buniutil.c |  49 ++++++++++++-------
 tests/testutf8.c   | 115 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 17 deletions(-)

diff --git a/bstring/buniutil.c b/bstring/buniutil.c
index ceafae7..fa6d37e 100644
--- a/bstring/buniutil.c
+++ b/bstring/buniutil.c
@@ -289,36 +289,51 @@ buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom,
 	}
 
 	cc = 0;
-	for (; i < len; i++) {
-		cpUcs4 c;
+	while (i < len) {
 		cpUcs4 v;
+		int invalid = 0;
+
 		v = endSwap(utf16[i], sm);
+		i++;
 
 		if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
-			if (v >= 0xDC00 || i >= len) {
-			ErrMode:;
-				if (~0 == errCh) {
-				ErrReturn:;
-					bu->slen = oldSlen;
-					return BSTR_ERR;
-				}
-				v = errCh;
+			if (v >= 0xDC00) {
+				invalid = 1; /* Isolated low surrogate */
+			} else if (i >= len) {
+				invalid = 1; /* Unterminated high surrogate */
 			} else {
-				i++;
-				if ((c = endSwap(utf16[i], sm) - 0xDC00) > 0x3FF)
-					goto ErrMode;
-				v = ((v - 0xD800) << 10) + c + 0x10000;
+				cpUcs4 c = endSwap(utf16[i], sm);
+				if (c < 0xDC00 || c > 0xDFFF) {
+					invalid = 1;
+				} else {
+					i++;
+					v = ((v - 0xD800) << 10) + (c - 0xDC00) + 0x10000;
+				}
+			}
+		}
+
+		if (invalid) {
+			if (~0 == errCh) {
+				bu->slen = oldSlen;
+				return BSTR_ERR;
 			}
+			v = errCh;
 		}
+
 		buff[cc] = v;
 		cc++;
 		if (cc >= TEMP_UCS4_BUFFER_SIZE) {
-			if (0 > buAppendBlkUcs4(bu, buff, cc, errCh))
-				goto ErrReturn;
+			if (0 > buAppendBlkUcs4(bu, buff, cc, errCh)) {
+				bu->slen = oldSlen;
+				return BSTR_ERR;
+			}
 			cc = 0;
 		}
 	}
-	if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) goto ErrReturn;
+	if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) {
+		bu->slen = oldSlen;
+		return BSTR_ERR;
+	}
 
 	return BSTR_OK;
 }
diff --git a/tests/testutf8.c b/tests/testutf8.c
index 1e28d17..253ba84 100644
--- a/tests/testutf8.c
+++ b/tests/testutf8.c
@@ -637,6 +637,120 @@ START_TEST(core_010)
 		ck_assert_int_eq(b->data[0], 'A');
 		bdestroy(b);
 	}
+
+	/* Invalid low surrogate alone with valid errCh → substituted */
+	{
+		cpUcs2 u[] = { 0xDC00 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 1, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], '?');
+		bdestroy(b);
+	}
+
+	/* Invalid low surrogate then ASCII with valid errCh */
+	{
+		cpUcs2 u[] = { 0xDC00, 0x0041 };
+		unsigned char expected[] = { '?', 'A' };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 2);
+		ret = memcmp(b->data, expected, 2);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
+
+	/* Invalid surrogate with invalid errCh → BSTR_ERR and rollback */
+	{
+		cpUcs2 u[] = { 0xDC00 };
+		b = bfromcstr("pre");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 1, NULL, 0xD800); /* invalid errCh */
+		ck_assert_int_eq(ret, BSTR_ERR);
+		ck_assert_int_eq(b->slen, 3); /* unchanged */
+		ck_assert_int_eq(b->data[0], 'p');
+		ck_assert_int_eq(b->data[1], 'r');
+		ck_assert_int_eq(b->data[2], 'e');
+		bdestroy(b);
+	}
+
+	/* bom out-parameter gets set when BOM appears in stream */
+	{
+		cpUcs2 in_bom = 0;
+		cpUcs2 u[] = { 0xFEFF, 0x0041 };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, &in_bom, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(in_bom, 0xFEFF);
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], 'A');
+		bdestroy(b);
+	}
+
+	/* Pre-seeded bom controls endianness even without BOM in input */
+	{
+		cpUcs2 in_bom = 0xFFFE;
+		cpUcs2 u[] = { 0x4100 }; /* bytes for 0x0041 in opposite endian */
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 1, &in_bom, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(in_bom, 0xFFFE); /* preserved */
+		ck_assert_int_eq(b->slen, 1);
+		ck_assert_int_eq(b->data[0], 'A');
+		bdestroy(b);
+	}
+
+	/* Larger than TEMP_UCS4_BUFFER_SIZE exercises internal flush path */
+	{
+		cpUcs2 u[80];
+		for (int j = 0; j < 80; j++) {
+			u[j] = (cpUcs2)('A' + (j % 26));
+		}
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 80, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 80);
+		for (int j = 0; j < 80; j++) {
+			ck_assert_int_eq(b->data[j], 'A' + (j % 26));
+		}
+		bdestroy(b);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_011: regression guard
+ *
+ * Guard against regressions for:
+ *   high surrogate followed by non-low surrogate.
+ *
+ * Expected behavior is:
+ *   first code unit substituted with errCh, second processed normally.
+ * ----------------------------------------------------------------------- */
+START_TEST(core_011)
+{
+	bstring b;
+	int ret;
+
+	{
+		cpUcs2 u[] = { 0xD83D, 0x0041 };
+		unsigned char expected[] = { '?', 'A' };
+		b = bfromcstr("");
+		ck_assert(b != NULL);
+		ret = buAppendBlkUTF16(b, u, 2, NULL, '?');
+		ck_assert_int_eq(ret, BSTR_OK);
+		ck_assert_int_eq(b->slen, 2);
+		ret = memcmp(b->data, expected, 2);
+		ck_assert_int_eq(ret, 0);
+		bdestroy(b);
+	}
 }
 END_TEST
 
@@ -658,6 +772,7 @@ main(void)
 	tcase_add_test(core, core_008);
 	tcase_add_test(core, core_009);
 	tcase_add_test(core, core_010);
+	tcase_add_test(core, core_011);
 	suite_add_tcase(suite, core);
 	/* Run tests */
 	SRunner *runner = srunner_create(suite);

From 4ae9b8a97704de0e18178a1960e610f9b1e6fd89 Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Tue, 24 Feb 2026 19:12:46 +0100
Subject: [PATCH 4/6] fix truncation bounds check bug and use flag for error
 handling

---
 bstring/utf8util.c | 143 +++++++++++++++++++++++++++++----------------
 tests/testutf8.c   | 112 ++++++++++++++++++++++++++++++-----
 2 files changed, 192 insertions(+), 63 deletions(-)

diff --git a/bstring/utf8util.c b/bstring/utf8util.c
index c6e2a80..3ce310f 100644
--- a/bstring/utf8util.c
+++ b/bstring/utf8util.c
@@ -184,6 +184,7 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 	long v;
 	int i;
 	int ofs;
+	int invalid;
 
 	if (NULL == iter || iter->next < 0) return errCh;
 	if (iter->next >= iter->slen) {
@@ -197,39 +198,64 @@ utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 	iter->error = 0;
 	c = chrs[0];
 	ofs = 0;
+	invalid = 0;
 
 	if (c < 0xC0 || c > 0xFD) {
-		if (c >= 0x80) goto ErrMode;
-		v = c;
-		ofs = 1;
+		if (c >= 0x80) {
+			invalid = 1;
+		} else {
+			v = c;
+			ofs = 1;
+		}
 	} else if (c < 0xE0) {
-		if (iter->next >= iter->slen + 1) goto ErrMode;
-		v = (c << 6u) - (0x0C0 << 6u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		v += c;
-		if (c >= 0x40 || v < 0x80) goto ErrMode;
-		ofs = 2;
+		if (iter->next + 1 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 6u) - (0x0C0 << 6u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			v += c;
+			if (c >= 0x40 || v < 0x80) {
+				invalid = 1;
+			} else {
+				ofs = 2;
+			}
+		}
 	} else if (c < 0xF0) {
-		if (iter->next >= iter->slen + 2) goto ErrMode;
-		v = (c << 12) - (0x0E0 << 12u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
-		v += (c << 6u) + d;
-		if ((c|d) >= 0x40 || v < 0x800 ||
-		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
-		ofs = 3;
+		if (iter->next + 2 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 12) - (0x0E0 << 12u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+			v += (c << 6u) + d;
+			if ((c|d) >= 0x40 || v < 0x800 ||
+			    !isLegalUnicodeCodePoint(v)) {
+				invalid = 1;
+			} else {
+				ofs = 3;
+			}
+		}
 	} else if (c < 0xF8) {
-		if (iter->next >= iter->slen + 3) goto ErrMode;
-		v = (c << 18) - (0x0F0 << 18u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
-		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
-		v += (c << 12u) + (d << 6u) + e;
-		if ((c|d|e) >= 0x40 || v < 0x10000 ||
-		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
-		ofs = 4;
+		if (iter->next + 3 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 18) - (0x0F0 << 18u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+			e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+			v += (c << 12u) + (d << 6u) + e;
+			if ((c|d|e) >= 0x40 || v < 0x10000 ||
+			    !isLegalUnicodeCodePoint(v)) {
+				invalid = 1;
+			} else {
+				ofs = 4;
+			}
+		}
 	} else { /* 5 and 6 byte encodings are invalid */
-	ErrMode:;
+		invalid = 1;
+	}
+
+	if (invalid) {
 		iter->error = 1;
 		v = errCh;
 		for (i = iter->next+1; i < iter->slen; i++) {
@@ -261,6 +287,7 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 	unsigned char d;
 	unsigned char e;
 	long v;
+	int invalid;
 
 	if (NULL == iter || iter->next < 0) return errCh;
 	if (iter->next >= iter->slen) {
@@ -273,35 +300,51 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 
 	iter->error = 0;
 	c = chrs[0];
+	invalid = 0;
 
 	if (c < 0xC0 || c > 0xFD) {
-		if (c >= 0x80) goto ErrMode;
-		v = c;
+		if (c >= 0x80) {
+			invalid = 1;
+		} else {
+			v = c;
+		}
 	} else if (c < 0xE0) {
-		if (iter->next >= iter->slen + 1) goto ErrMode;
-		v = (c << 6u) - (0x0C0 << 6u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		v += c;
-		if (c >= 0x40 || v < 0x80) goto ErrMode;
+		if (iter->next + 1 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 6u) - (0x0C0 << 6u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			v += c;
+			if (c >= 0x40 || v < 0x80) invalid = 1;
+		}
 	} else if (c < 0xF0) {
-		if (iter->next >= iter->slen + 2) goto ErrMode;
-		v = (c << 12lu) - (0x0E0 << 12u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
-		v += (c << 6u) + d;
-		if ((c|d) >= 0x40 || v < 0x800 ||
-		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+		if (iter->next + 2 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 12lu) - (0x0E0 << 12u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+			v += (c << 6u) + d;
+			if ((c|d) >= 0x40 || v < 0x800 ||
+			    !isLegalUnicodeCodePoint(v)) invalid = 1;
+		}
 	} else if (c < 0xF8) {
-		if (iter->next >= iter->slen + 3) goto ErrMode;
-		v = (c << 18lu) - (0x0F0 << 18u);
-		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
-		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
-		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
-		v += (c << 12lu) + (d << 6u) + e;
-		if ((c|d|e) >= 0x40 || v < 0x10000 ||
-		    !isLegalUnicodeCodePoint(v)) goto ErrMode;
+		if (iter->next + 3 >= iter->slen) {
+			invalid = 1;
+		} else {
+			v = (c << 18lu) - (0x0F0 << 18u);
+			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+			e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+			v += (c << 12lu) + (d << 6u) + e;
+			if ((c|d|e) >= 0x40 || v < 0x10000 ||
+			    !isLegalUnicodeCodePoint(v)) invalid = 1;
+		}
 	} else { /* 5 and 6 byte encodings are invalid */
-	ErrMode:;
+		invalid = 1;
+	}
+
+	if (invalid) {
 		iter->error = 1;
 		v = errCh;
 	}
diff --git a/tests/testutf8.c b/tests/testutf8.c
index 253ba84..e5545a6 100644
--- a/tests/testutf8.c
+++ b/tests/testutf8.c
@@ -224,9 +224,93 @@ START_TEST(core_004)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_005: utf8IteratorGetCurrCodePoint — peek without advancing
+ * core_005: utf8IteratorGetNextCodePoint — truncated sequence bounds checks
+ *
+ * The backing arrays contain full valid code points, but slen is set so the
+ * sequence is truncated at the end. Iterator must treat each as invalid and
+ * return errCh instead of decoding bytes past slen.
  * ----------------------------------------------------------------------- */
 START_TEST(core_005)
+{
+	struct utf8Iterator iter;
+	cpUcs4 cp;
+
+	{
+		unsigned char data[] = { 0xC2, 0xA9 };
+		utf8IteratorInit(&iter, data, 1);
+		cp = utf8IteratorGetNextCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.start, 0);
+		ck_assert_int_eq(iter.next, 1);
+	}
+
+	{
+		unsigned char data[] = { 0xE2, 0x82, 0xAC };
+		utf8IteratorInit(&iter, data, 2);
+		cp = utf8IteratorGetNextCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.start, 0);
+		ck_assert_int_eq(iter.next, 2);
+	}
+
+	{
+		unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 };
+		utf8IteratorInit(&iter, data, 3);
+		cp = utf8IteratorGetNextCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.start, 0);
+		ck_assert_int_eq(iter.next, 3);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_006: utf8IteratorGetCurrCodePoint — truncated sequence bounds checks
+ *
+ * Peek must never decode bytes beyond slen. For each truncated sequence, it
+ * should return errCh, set iter.error, and leave iter.next unchanged.
+ * ----------------------------------------------------------------------- */
+START_TEST(core_006)
+{
+	struct utf8Iterator iter;
+	cpUcs4 cp;
+
+	{
+		unsigned char data[] = { 0xC2, 0xA9 };
+		utf8IteratorInit(&iter, data, 1);
+		cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.next, 0);
+	}
+
+	{
+		unsigned char data[] = { 0xE2, 0x82, 0xAC };
+		utf8IteratorInit(&iter, data, 2);
+		cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.next, 0);
+	}
+
+	{
+		unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 };
+		utf8IteratorInit(&iter, data, 3);
+		cp = utf8IteratorGetCurrCodePoint(&iter, '?');
+		ck_assert_int_eq(cp, '?');
+		ck_assert_int_eq(iter.error, 1);
+		ck_assert_int_eq(iter.next, 0);
+	}
+}
+END_TEST
+
+/* -----------------------------------------------------------------------
+ * core_007: utf8IteratorGetCurrCodePoint — peek without advancing
+ * ----------------------------------------------------------------------- */
+START_TEST(core_007)
 {
 	struct utf8Iterator iter;
 	unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */
@@ -258,11 +342,11 @@ START_TEST(core_005)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_006: utf8ScanBackwardsForCodePoint — various positions
+ * core_008: utf8ScanBackwardsForCodePoint — various positions
  *
  *   Data: © (0xC2 0xA9) at bytes 0-1, then 'A' (0x41) at byte 2
  * ----------------------------------------------------------------------- */
-START_TEST(core_006)
+START_TEST(core_008)
 {
 	unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */
 	cpUcs4 out;
@@ -298,9 +382,9 @@ START_TEST(core_006)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_007: buIsUTF8Content
+ * core_009: buIsUTF8Content
  * ----------------------------------------------------------------------- */
-START_TEST(core_007)
+START_TEST(core_009)
 {
 	bstring b;
 	int ret;
@@ -360,9 +444,9 @@ START_TEST(core_007)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_008: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring
+ * core_010: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring
  * ----------------------------------------------------------------------- */
-START_TEST(core_008)
+START_TEST(core_010)
 {
 	bstring b;
 	int ret;
@@ -458,9 +542,9 @@ START_TEST(core_008)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_009: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array
+ * core_011: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array
  * ----------------------------------------------------------------------- */
-START_TEST(core_009)
+START_TEST(core_011)
 {
 	cpUcs2 buf[16];
 	int ret;
@@ -537,9 +621,9 @@ START_TEST(core_009)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_010: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring
+ * core_012: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring
  * ----------------------------------------------------------------------- */
-START_TEST(core_010)
+START_TEST(core_012)
 {
 	bstring b;
 	int ret;
@@ -726,7 +810,7 @@ START_TEST(core_010)
 END_TEST
 
 /* -----------------------------------------------------------------------
- * core_011: regression guard
+ * core_013: regression guard
  *
  * Guard against regressions for:
  *   high surrogate followed by non-low surrogate.
@@ -734,7 +818,7 @@ END_TEST
  * Expected behavior is:
  *   first code unit substituted with errCh, second processed normally.
  * ----------------------------------------------------------------------- */
-START_TEST(core_011)
+START_TEST(core_013)
 {
 	bstring b;
 	int ret;
@@ -773,6 +857,8 @@ main(void)
 	tcase_add_test(core, core_009);
 	tcase_add_test(core, core_010);
 	tcase_add_test(core, core_011);
+	tcase_add_test(core, core_012);
+	tcase_add_test(core, core_013);
 	suite_add_tcase(suite, core);
 	/* Run tests */
 	SRunner *runner = srunner_create(suite);

From 2ed0053d984aea3ea3e93f9642beea6e914e12b0 Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Tue, 24 Feb 2026 20:08:45 +0100
Subject: [PATCH 5/6] return when encountering invalid continuation bytes

---
 bstring/utf8util.c |  7 ++++++-
 tests/testutf8.c   | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/bstring/utf8util.c b/bstring/utf8util.c
index 3ce310f..d8a69f7 100644
--- a/bstring/utf8util.c
+++ b/bstring/utf8util.c
@@ -97,7 +97,6 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos,
 		return 0;
 	} else if (msg[pos] < 0xC0) {
 		if (0 == pos) return -__LINE__;
-		ret = -__LINE__;
 		if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) {
 			pos--;
 			ret = 1;
@@ -119,6 +118,7 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos,
 	}
 	if (msg[pos] < 0xE0) {
 		if (pos + 1 >= len) return -__LINE__;
+		if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__;
 		v1 = msg[pos]   & ~0xE0;
 		v2 = msg[pos+1] & ~0xC0;
 		v1 = (v1 << 6) + v2;
@@ -128,6 +128,8 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos,
 	}
 	if (msg[pos] < 0xF0) {
 		if (pos + 2 >= len) return -__LINE__;
+		if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__;
+		if ((msg[pos+2] & 0xC0) != 0x80) return -__LINE__;
 		v1 = msg[pos]   & ~0xF0;
 		v2 = msg[pos+1] & ~0xC0;
 		v3 = msg[pos+2] & ~0xC0;
@@ -141,6 +143,9 @@ utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos,
 	if (msg[pos] >= 0xF8) return -__LINE__;
 
 	if (pos + 3 >= len) return -__LINE__;
+	if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__;
+	if ((msg[pos+2] & 0xC0) != 0x80) return -__LINE__;
+	if ((msg[pos+3] & 0xC0) != 0x80) return -__LINE__;
 	v1 = msg[pos]   & ~0xF8;
 	v2 = msg[pos+1] & ~0xC0;
 	v3 = msg[pos+2] & ~0xC0;
diff --git a/tests/testutf8.c b/tests/testutf8.c
index e5545a6..792f737 100644
--- a/tests/testutf8.c
+++ b/tests/testutf8.c
@@ -838,6 +838,40 @@ START_TEST(core_013)
 }
 END_TEST
 
+/* -----------------------------------------------------------------------
+ * core_014: utf8ScanBackwardsForCodePoint — invalid continuation bytes
+ *
+ * Each case starts at a lead byte but includes one or more non-continuation
+ * trailing bytes. Scanner must reject these and return an error.
+ * ----------------------------------------------------------------------- */
+START_TEST(core_014)
+{
+	cpUcs4 out = 0;
+	int ret;
+
+	/* Invalid 2-byte sequence: second byte must be 10xxxxxx */
+	{
+		unsigned char data[] = { 0xC2, 0x41 };
+		ret = utf8ScanBackwardsForCodePoint(data, 2, 0, &out);
+		ck_assert(ret < 0);
+	}
+
+	/* Invalid 3-byte sequence: middle byte must be 10xxxxxx */
+	{
+		unsigned char data[] = { 0xE2, 0x28, 0xAC };
+		ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out);
+		ck_assert(ret < 0);
+	}
+
+	/* Invalid 4-byte sequence: third byte must be 10xxxxxx */
+	{
+		unsigned char data[] = { 0xF0, 0x9F, 0x41, 0x80 };
+		ret = utf8ScanBackwardsForCodePoint(data, 4, 0, &out);
+		ck_assert(ret < 0);
+	}
+}
+END_TEST
+
 int
 main(void)
 {
@@ -859,6 +893,7 @@ main(void)
 	tcase_add_test(core, core_011);
 	tcase_add_test(core, core_012);
 	tcase_add_test(core, core_013);
+	tcase_add_test(core, core_014);
 	suite_add_tcase(suite, core);
 	/* Run tests */
 	SRunner *runner = srunner_create(suite);

From 01c0a19312569b805ffaf96aa6d4f97f36f4c70c Mon Sep 17 00:00:00 2001
From: Daniel Markstedt <daniel@mindani.net>
Date: Tue, 24 Feb 2026 20:14:59 +0100
Subject: [PATCH 6/6] uppercase integer literal notation to prevent ambiguity

---
 bstring/utf8util.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bstring/utf8util.c b/bstring/utf8util.c
index d8a69f7..8362d25 100644
--- a/bstring/utf8util.c
+++ b/bstring/utf8util.c
@@ -326,7 +326,7 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 		if (iter->next + 2 >= iter->slen) {
 			invalid = 1;
 		} else {
-			v = (c << 12lu) - (0x0E0 << 12u);
+			v = (c << 12UL) - (0x0E0 << 12u);
 			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
 			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
 			v += (c << 6u) + d;
@@ -337,11 +337,11 @@ utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh)
 		if (iter->next + 3 >= iter->slen) {
 			invalid = 1;
 		} else {
-			v = (c << 18lu) - (0x0F0 << 18u);
+			v = (c << 18UL) - (0x0F0 << 18u);
 			c = (unsigned char) ((unsigned) chrs[1] - 0x080);
 			d = (unsigned char) ((unsigned) chrs[2] - 0x080);
 			e = (unsigned char) ((unsigned) chrs[3] - 0x080);
-			v += (c << 12lu) + (d << 6u) + e;
+			v += (c << 12UL) + (d << 6u) + e;
 			if ((c|d|e) >= 0x40 || v < 0x10000 ||
 			    !isLegalUnicodeCodePoint(v)) invalid = 1;
 		}