Fix issues in uniscribe code (#18744)

LeonarddeR · web-flow · commit 92f345eaebc7 · 2025-08-20T11:02:55.000+10:00
Closes #18722 Reverts 021bcd8 Summary of the issue: The code that split a string at character boundaries discarded characters like \u0301 when such a character was the only character in a string. Description of user facing changes: When unicode normalization is on, navigating by character will again correctly announce characters like acute (\u0301). Description of developer facing changes: NVDAHelper local calculateCharacterBoundaries now needs an offsets array that is textLength + 1 in size, rather than only the text length. This is to store the end offset of the last character. This reverts 021bcd8, so we no longer pass additional extraneous alphanumeric characters to uniscribe to make it happy. Description of development approach: While debugging this issue, I found several issues in the NVDAHelper local textUtils _getLogAttrArray function, which is the basis of several calculation functions: The buffer passed to ScriptItemize was too small when a string only contained one character. The buffer should have space for at least two SCRIPT_ITEM structures. This is my hypothesis about the necessity of 021bcd8, namely the addition of two alphanumeric characters passed to the uniscribe methods. While there are no known strings that exactly reproduce the issue behind 021bcd8, I tried several combinations of shorter and longer strings without alphanumeric characters, and I couldn't reproduce any of the behavior described in it after my changes to the C++ code. Therefore I reverted 021bcd8, as there is enough time to test this in Alpha thoroughly. Calculation of character offsets would never set the end offset when requesting the offsets of the last character in the string, since the end offset calculation starts at offset + 1 and offset + 1 is equal to textLength. Failed calls of ScriptItemize and ScriptBreak were never logged. ScriptItemize definitely failed when passing a string with only one character in it because the expected buffer was too small.
diff --git a/nvdaHelper/local/textUtils.cpp b/nvdaHelper/local/textUtils.cpp
@@ -28,20 +28,34 @@ vector<SCRIPT_LOGATTR> _getLogAttrArray(const wchar_t* text, int textLength) {
 	if (textLength <= 0 || !text) {
 		return {};
 	}
-	vector<SCRIPT_ITEM> items(textLength + 1);
+	// It is invalid to call ScriptItemize with a buffer that holds fewer than two SCRIPT_ITEM structures.
+	auto cMaxItems = textLength + 1;
+	// The buffer should hold (cMaxItems + 1) SCRIPT_ITEM structures, because ScriptItemize always appends a terminal item.
+	vector<SCRIPT_ITEM> items(cMaxItems + 1);
 	int numItems = 0;
-	if (ScriptItemize(text, textLength, textLength, nullptr, nullptr, items.data(), &numItems) != S_OK || numItems == 0) {
+	HRESULT hr;
+	if ((hr = ScriptItemize(text, textLength, cMaxItems, nullptr, nullptr, items.data(), &numItems)) != S_OK || numItems == 0) {
+		LOG_ERROR(L"ScriptItemize failed for text '" << text << L"'; hr=" << hr);
 		return {};
 	}
 
 	vector<SCRIPT_LOGATTR> logAttrArray(textLength);
-	int nextICharPos = textLength;
+	// The function always adds a terminal item to the item analysis array.
+	// numItems contains the number of actually processed items excluding the terminating item.
+	int nextICharPos = textLength;  // should be equal to items[numItems].iCharPos
 	for (int itemIndex = numItems - 1; itemIndex >= 0; --itemIndex) {
 		int iCharPos = items[itemIndex].iCharPos;
 		int iCharLength = nextICharPos - iCharPos;
-		if (ScriptBreak(text + iCharPos, iCharLength, &(items[itemIndex].a), logAttrArray.data() + iCharPos) != S_OK) {
+		if ((hr = ScriptBreak(text + iCharPos, iCharLength, &(items[itemIndex].a), logAttrArray.data() + iCharPos)) != S_OK) {
+			LOG_ERROR(L"ScriptBreak failed for text '" << text << L"' at run " << itemIndex << L"; hr=" << hr);
 			return {};
 		}
+		// Note, ideally we'd set nextICharPos to iCharPos, so that the
+		// next iteration of the loop will only call ScriptBreak for the text that belongs to the current item.
+		// Now that we don't do this, every call of ScriptBreak refills logAttrArray
+		// for the characters after this item based on the SCRIPT_ANALYSIS for the current item,
+		// effectively treating all the characters as belonging to the script at itemIndex = 0.
+		// However, resetting nextICharPos causes word segmentation to differ from the one used in notepad.
 	}
 	return logAttrArray;
 }
@@ -55,8 +69,8 @@ bool calculateCharacterBoundaries(const wchar_t* text, int textLength, int* offs
 		return false;
 	}
 	int count = 0;
-	for (int i = 0; i < textLength; ++i) {
-		if (logAttrArray[i].fCharStop) {
+	for (int i = 0; i <= textLength; ++i) {
+		if (i == textLength || logAttrArray[i].fCharStop) {
 			offsets[count++] = i;
 		}
 	}
@@ -88,6 +102,7 @@ bool _calculateUniscribeOffsets(enum UNIT unit, wchar_t* text, int textLength, i
 				break;
 			}
 		}
+		*endOffset = textLength;
 		for(int i=offset+1;i<textLength;++i) {
 			if (logAttrArray[i].fCharStop) {
 				*endOffset=i;
diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py
@@ -344,20 +344,15 @@ def _calculateUniscribeOffsets(
 			raise NotImplementedError(f"Unit: {unit}")
 		relStart = ctypes.c_int()
 		relEnd = ctypes.c_int()
-		# uniscribe does some strange things
-		# when you give it a string  with not more than two alphanumeric chars in a row.
-		# Inject two alphanumeric characters at the end to fix this
-		uniscribeLineText = lineText + "xx"
 		# We can't rely on len(lineText) to calculate the length of the line.
 		offsetConverter = textUtils.WideStringOffsetConverter(lineText)
 		lineLength = offsetConverter.encodedStringLength
 		if self.encoding != textUtils.WCHAR_ENCODING:
 			# We need to convert the str based line offsets to wide string offsets.
 			relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0]
-		uniscribeLineLength = lineLength + 2
 		if helperFunc(
-			uniscribeLineText,
-			uniscribeLineLength,
+			lineText,
+			lineLength,
 			relOffset,
 			ctypes.byref(relStart),
 			ctypes.byref(relEnd),
diff --git a/source/textUtils/uniscribe.py b/source/textUtils/uniscribe.py
@@ -20,14 +20,10 @@ def splitAtCharacterBoundaries(text: str) -> Generator[str, None, None]:
 		raise RuntimeError("NVDAHelper not initialized")
 	if not text:
 		return
-	# uniscribe does some strange things
-	# when you give it a string with not more than two alphanumeric chars in a row.
-	# Inject two alphanumeric characters at the end to fix this
-	uniscribeText = text + "xx"
-	buffer = ctypes.create_unicode_buffer(uniscribeText)
+	buffer = ctypes.create_unicode_buffer(text)
 	textLength = len(buffer) - 1  # Length without terminating NULL character
 	offsetsCount = ctypes.c_int()
-	offsets = (ctypes.c_int * textLength)()
+	offsets = (ctypes.c_int * (textLength + 1))()
 	if not NVDAHelper.localLib.calculateCharacterBoundaries(
 		buffer,
 		textLength,
@@ -36,7 +32,7 @@ def splitAtCharacterBoundaries(text: str) -> Generator[str, None, None]:
 	):
 		raise RuntimeError("NVDAHelper calculateCharacterBoundaries failed")
 	# Get the end offsets of the characters we need.
-	calculatedOffsets = offsets[1 : (offsetsCount.value - 1)]
+	calculatedOffsets = offsets[1 : (offsetsCount.value + 1)]
 	start = 0
 	for end in calculatedOffsets:
 		yield buffer[start:end]
diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md
@@ -20,6 +20,8 @@ This can be enabled using the "Report when lists support multiple selection" set
 
 ### Bug Fixes
 
+* When unicode normalization is enabled for speech, navigating by character will again correctly announce combining diacritic characters like acute ( &#x0301; ). (#18722, @LeonarddeR)
+
 ### Changes for Developers
 
 Please refer to [the developer guide](https://download.nvaccess.org/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process.