nvaccess · michaelDCurran · Nov 28, 2019 · Nov 26, 2019 · Nov 26, 2019 · Nov 26, 2019
@@ -51,6 +51,7 @@ EXPORTS
 	displayModel_getCaretRect
 	displayModel_requestTextChangeNotificationsForWindow
 	calculateWordOffsets
+	calculateCharacterOffsets
 	findWindowWithClassInThread
 	registerUIAProperty
 	dllImportTableHooks_hookSingle

@@ -1,7 +1,31 @@
+/*
+This file is a part of the NVDA project.
+URL: http://www.nvda-project.org/
+Copyright 2008-2019 NV Access Limited.
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License version 2.0, as published by
+    the Free Software Foundation.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+This license can be found at:
+http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
+*/
+
 #include <windows.h>
 #include <usp10.h>
+#include <common/log.h>
 
-bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
+// Text unit whose boundaries _calculateUniscribeOffsets is asked to locate.
+enum UNIT {
+	UNIT_CHARACTER, // boundaries are read from the fCharStop attribute
+	UNIT_WORD // boundaries are read from the fWordStop attribute, then adjusted for whitespace
+};
+
+bool _calculateUniscribeOffsets(enum UNIT unit, wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
+	if(unit!=UNIT_CHARACTER&&unit!=UNIT_WORD) {
+		LOG_ERROR(L"Unsupported unit");
+		return false;
+	}
 	if(textLength<=0) return false;
 	if(offset<0) return false;
 	if(offset>=textLength) {
@@ -27,51 +51,73 @@ bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startO
 		}
 	}
 	delete[] pItems;
-	for(int i=offset;i>=0;--i) {
-		if(logAttrArray[i].fWordStop) {
-			*startOffset=i;
-			break;
+	if(unit==UNIT_CHARACTER) {
+		for(int i=offset;i>=0;--i) {
+			if(logAttrArray[i].fCharStop) {
+				*startOffset=i;
+				break;
+			}
 		}
-	}
-	// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
-	bool skipWhitespace=true;
-	for(int i=offset;i>=*startOffset;--i) {
-		if(iswspace(text[i])) {
-			if(skipWhitespace) {
-				// If we start in a block of whitespace, the word must start before this,
-				// as whitespace is included at the end of a word.
-				// Therefore, skip the whitespace and keep searching.
-				continue;
+		for(int i=offset+1;i<textLength;++i) {
+			if(logAttrArray[i].fCharStop) {
+				*endOffset=i;
+				break;
 			}
-			// This is whitespace. The word starts after it.
-			*startOffset=i+1;
-			break;
-		} else
-			skipWhitespace=false;
-	}
-	*endOffset=textLength;
-	for(int i=offset+1;i<textLength;++i) {
-		if(logAttrArray[i].fWordStop) {
-			*endOffset=i;
-			break;
 		}
-	}
-	// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
-	for(int i=offset;i<*endOffset;++i) {
-		if(iswspace(text[i])) {
-			// This begins a block of whitespace. The word ends after it.
-			// Find the end of the whitespace.
-			for(;i<*endOffset;++i) {
-				if(!iswspace(text[i]))
-					break;
+	} else if(unit==UNIT_WORD) {
+		for(int i=offset;i>=0;--i) {
+			if(logAttrArray[i].fWordStop) {
+				*startOffset=i;
+				break;
+			}
+		}
+		// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
+		bool skipWhitespace=true;
+		for(int i=offset;i>=*startOffset;--i) {
+			if(iswspace(text[i])) {
+				if(skipWhitespace) {
+					// If we start in a block of whitespace, the word must start before this,
+					// as whitespace is included at the end of a word.
+					// Therefore, skip the whitespace and keep searching.
+					continue;
+				}
+				// This is whitespace. The word starts after it.
+				*startOffset=i+1;
+				break;
+			} else
+				skipWhitespace=false;
+		}
+		*endOffset=textLength;
+		for(int i=offset+1;i<textLength;++i) {
+			if(logAttrArray[i].fWordStop) {
+				*endOffset=i;
+				break;
+			}
+		}
+		// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
+		for(int i=offset;i<*endOffset;++i) {
+			if(iswspace(text[i])) {
+				// This begins a block of whitespace. The word ends after it.
+				// Find the end of the whitespace.
+				for(;i<*endOffset;++i) {
+					if(!iswspace(text[i]))
+						break;
+				}
+				// We're now positioned on the first non-whitespace character,
+				// so the word ends here.
+				*endOffset=i;
+				break;
 			}
-			// We're now positioned on the first non-whitespace character,
-			// so the word ends here.
-			*endOffset=i;
-			break;
 		}
 	}
 	delete[] logAttrArray;
 	return true;
 }
 
+// Exported entry point for word bounds.
+// Forwards all arguments to _calculateUniscribeOffsets with UNIT_WORD and returns its result unchanged.
+bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
+	return _calculateUniscribeOffsets(UNIT_WORD, text, textLength, offset, startOffset, endOffset);
+}
+
+// Exported entry point for character bounds.
+// Forwards all arguments to _calculateUniscribeOffsets with UNIT_CHARACTER and returns its result unchanged.
+bool calculateCharacterOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
+	return _calculateUniscribeOffsets(UNIT_CHARACTER, text, textLength, offset, startOffset, endOffset);
+}
@@ -16,7 +16,7 @@
 import api
 import textUtils
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Tuple
 import locale
 from logHandler import log
 
@@ -301,18 +301,66 @@ def _getFormatFieldAndOffsets(self,offset,formatConfig,calculateOffsets=True):
 				formatField["line-number"]=lineNum+1
 		return formatField,(startOffset,endOffset)
 
-	def _getCharacterOffsets(self,offset):
+	def _calculateUniscribeOffsets(self, lineText: str, unit: str, relOffset: int) -> Optional[Tuple[int, int]]:
+		"""
+		Calculates the bounds of a unit at an offset within a given string of text
+		using the Windows Uniscribe library, also used in Notepad, for example.
+		Units supported are character and word.
+		@param lineText: the text string to analyze
+		@param unit: the TextInfo unit (character or word)
+		@param relOffset: the character offset within the text string at which to calculate the bounds.
+		@return: the (start, end) offsets of the unit, relative to the start of lineText
+			and in the same kind of offsets as relOffset, or None if the helper call failed.
+		@raise NotImplementedError: if unit is neither character nor word.
+		"""
+		# Units are strings, so compare by value; identity is not guaranteed for equal strings.
+		if unit == textInfos.UNIT_WORD:
+			helperFunc = NVDAHelper.localLib.calculateWordOffsets
+		elif unit == textInfos.UNIT_CHARACTER:
+			helperFunc = NVDAHelper.localLib.calculateCharacterOffsets
+		else:
+			raise NotImplementedError(f"Unit: {unit}")
+		relStart = ctypes.c_int()
+		relEnd = ctypes.c_int()
+		# Uniscribe does some strange things
+		# when you give it a string with not more than two alphanumeric chars in a row.
+		# Inject two alphanumeric characters at the end to fix this.
+		uniscribeLineText = lineText + "xx"
+		# We can't rely on len(lineText) to calculate the length of the line.
+		offsetConverter = textUtils.WideStringOffsetConverter(lineText)
+		lineLength = offsetConverter.wideStringLength
+		if self.encoding != textUtils.WCHAR_ENCODING:
+			# We need to convert the str based line offsets to wide string offsets.
+			relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
+		# Account for the two injected characters.
+		uniscribeLineLength = lineLength + 2
+		if helperFunc(
+			uniscribeLineText,
+			uniscribeLineLength,
+			relOffset,
+			ctypes.byref(relStart),
+			ctypes.byref(relEnd)
+		):
+			relStart = relStart.value
+			# Clamp the end so that the injected characters are never part of the result.
+			relEnd = min(lineLength, relEnd.value)
+			if self.encoding != textUtils.WCHAR_ENCODING:
+				# We need to convert the uniscribe based offsets to str offsets.
+				relStart, relEnd = offsetConverter.wideToStrOffsets(relStart, relEnd)
+			return (relStart, relEnd)
+		log.debugWarning(f"Uniscribe failed to calculate {unit} offsets for text {lineText!r}")
+		return None
+
+	def _getCharacterOffsets(self, offset):
+		if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
+			raise NotImplementedError
+		lineStart, lineEnd = self._getLineOffsets(offset)
+		lineText = self._getTextRange(lineStart, lineEnd)
+		relOffset = offset - lineStart
+		if self.useUniscribe:
+			offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
+			if offsets is not None:
+				return (offsets[0] + lineStart, offsets[1] + lineStart)
 		if self.encoding == textUtils.WCHAR_ENCODING:
-			lineStart,lineEnd=self._getLineOffsets(offset)
-			lineText=self._getTextRange(lineStart,lineEnd)
 			offsetConverter = textUtils.WideStringOffsetConverter(lineText)
-			relOffset = offset - lineStart
 			relStrStart, relStrEnd = offsetConverter.wideToStrOffsets(relOffset, relOffset + 1)
 			relWideStringStart, relWideStringEnd = offsetConverter.strToWideOffsets(relStrStart, relStrEnd)
 			return (relWideStringStart + lineStart, relWideStringEnd + lineStart)
-		elif self.encoding not in (None, "utf_32_le", locale.getlocale()[1]):
-			raise NotImplementedError
-		return offset, offset + 1
+		return (offset, offset + 1)
 
 	def _getWordOffsets(self,offset):
 		if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
@@ -323,33 +371,9 @@ def _getWordOffsets(self,offset):
 		lineText = lineText.translate({0:u' ',0xa0:u' '})
 		relOffset = offset - lineStart
 		if self.useUniscribe:
-			relStart=ctypes.c_int()
-			relEnd=ctypes.c_int()
-			# uniscribe does some strange things when you give it a string  with not more than two alphanumeric chars in a row.
-			# Inject two alphanumeric characters at the end to fix this 
-			uniscribeLineText = lineText + "xx"
-			# We can't rely on len(lineText) to calculate the length of the line.
-			if self.encoding != textUtils.WCHAR_ENCODING:
-				# We need to convert the str based line offsets to wide string offsets.
-				offsetConverter = textUtils.WideStringOffsetConverter(lineText)
-				lineLength = offsetConverter.wideStringLength
-				relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
-			else:
-				lineLength = (lineEnd - lineStart)
-			uniscribeLineLength = lineLength + 2
-			if NVDAHelper.localLib.calculateWordOffsets(
-				uniscribeLineText,
-				uniscribeLineLength,
-				relOffset,
-				ctypes.byref(relStart),
-				ctypes.byref(relEnd)
-			):
-				relStart = relStart.value
-				relEnd = min(lineLength, relEnd.value)
-				if self.encoding != textUtils.WCHAR_ENCODING:
-					# We need to convert the uniscribe based offsets to str offsets.
-					relStart, relEnd = offsetConverter.wideToStrOffsets(relStart, relEnd)
-				return (relStart + lineStart , relEnd + lineStart)
+			offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
+			if offsets is not None:
+				return (offsets[0] + lineStart, offsets[1] + lineStart)
 		#Fall back to the older word offsets detection that only breaks on non alphanumeric
 		if self.encoding == textUtils.WCHAR_ENCODING:
 			offsetConverter = textUtils.WideStringOffsetConverter(lineText)