Skip to content
Permalink
Browse files

Use uniscribe to calculate character offsets where allowed (#10550)

* OffsetsTextInfo._getCharacterOffsets: use uniscribe where possible to calculate the bounds for a character. This allows us to treat something like e-acute as one character.

* Add copyright header to textUtils.cpp

* Fix linting issues.

* Restore some accidentally removed code from OffsetsTextInfo._getCharacterOffsets, allowing unit tests to pass again.

* nvdaHelperLocal's textUtils.cpp: factor the shared logic out into a common helper so that calculateWordOffsets and calculateCharacterOffsets no longer duplicate code.

* Address review actions.
  • Loading branch information
michaelDCurran committed Nov 28, 2019
1 parent 4cc1643 commit 1045d2de2d0e669decee0237a77ebba40a8eb57f
Showing with 145 additions and 74 deletions.
  1. +1 −0 nvdaHelper/local/nvdaHelperLocal.def
  2. +85 −39 nvdaHelper/local/textUtils.cpp
  3. +59 −35 source/textInfos/offsets.py
@@ -51,6 +51,7 @@ EXPORTS
displayModel_getCaretRect
displayModel_requestTextChangeNotificationsForWindow
calculateWordOffsets
calculateCharacterOffsets
findWindowWithClassInThread
registerUIAProperty
dllImportTableHooks_hookSingle
@@ -1,7 +1,31 @@
/*
This file is a part of the NVDA project.
URL: http://www.nvda-project.org/
Copyright 2008-2019 NV Access Limited.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2.0, as published by
the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
This license can be found at:
http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
*/

#include <windows.h>
#include <usp10.h>
#include <common/log.h>

bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
// Text units for which _calculateUniscribeOffsets can calculate bounds.
enum UNIT {
// Bounds of a single character, located via the fCharStop logical attribute.
UNIT_CHARACTER,
// Bounds of a word, located via the fWordStop logical attribute
// (with extra whitespace handling, see #1656).
UNIT_WORD
};

bool _calculateUniscribeOffsets(enum UNIT unit, wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
if(unit!=UNIT_CHARACTER&&unit!=UNIT_WORD) {
LOG_ERROR(L"Unsupported unit");
return false;
}
if(textLength<=0) return false;
if(offset<0) return false;
if(offset>=textLength) {
@@ -27,51 +51,73 @@ bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startO
}
}
delete[] pItems;
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fWordStop) {
*startOffset=i;
break;
if(unit==UNIT_CHARACTER) {
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fCharStop) {
*startOffset=i;
break;
}
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
bool skipWhitespace=true;
for(int i=offset;i>=*startOffset;--i) {
if(iswspace(text[i])) {
if(skipWhitespace) {
// If we start in a block of whitespace, the word must start before this,
// as whitespace is included at the end of a word.
// Therefore, skip the whitespace and keep searching.
continue;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fCharStop) {
*endOffset=i;
break;
}
// This is whitespace. The word starts after it.
*startOffset=i+1;
break;
} else
skipWhitespace=false;
}
*endOffset=textLength;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fWordStop) {
*endOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
for(int i=offset;i<*endOffset;++i) {
if(iswspace(text[i])) {
// This begins a block of whitespace. The word ends after it.
// Find the end of the whitespace.
for(;i<*endOffset;++i) {
if(!iswspace(text[i]))
break;
} else if(unit==UNIT_WORD) {
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fWordStop) {
*startOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
bool skipWhitespace=true;
for(int i=offset;i>=*startOffset;--i) {
if(iswspace(text[i])) {
if(skipWhitespace) {
// If we start in a block of whitespace, the word must start before this,
// as whitespace is included at the end of a word.
// Therefore, skip the whitespace and keep searching.
continue;
}
// This is whitespace. The word starts after it.
*startOffset=i+1;
break;
} else
skipWhitespace=false;
}
*endOffset=textLength;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fWordStop) {
*endOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
for(int i=offset;i<*endOffset;++i) {
if(iswspace(text[i])) {
// This begins a block of whitespace. The word ends after it.
// Find the end of the whitespace.
for(;i<*endOffset;++i) {
if(!iswspace(text[i]))
break;
}
// We're now positioned on the first non-whitespace character,
// so the word ends here.
*endOffset=i;
break;
}
// We're now positioned on the first non-whitespace character,
// so the word ends here.
*endOffset=i;
break;
}
}
delete[] logAttrArray;
return true;
}

// Exported entry point: calculates the bounds of the word at the given offset.
// Delegates to _calculateUniscribeOffsets with UNIT_WORD.
// Returns true on success, in which case *startOffset and *endOffset are set.
bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
	const bool succeeded = _calculateUniscribeOffsets(
		UNIT_WORD,
		text,
		textLength,
		offset,
		startOffset,
		endOffset
	);
	return succeeded;
}

// Exported entry point: calculates the bounds of the character at the given offset.
// Delegates to _calculateUniscribeOffsets with UNIT_CHARACTER.
// Returns true on success, in which case *startOffset and *endOffset are set.
bool calculateCharacterOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
	const bool succeeded = _calculateUniscribeOffsets(
		UNIT_CHARACTER,
		text,
		textLength,
		offset,
		startOffset,
		endOffset
	);
	return succeeded;
}
@@ -16,7 +16,7 @@
import api
import textUtils
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Tuple
import locale
from logHandler import log

@@ -301,18 +301,66 @@ def _getFormatFieldAndOffsets(self,offset,formatConfig,calculateOffsets=True):
formatField["line-number"]=lineNum+1
return formatField,(startOffset,endOffset)

def _getCharacterOffsets(self,offset):
def _calculateUniscribeOffsets(self, lineText: str, unit: str, relOffset: int) -> Optional[Tuple[int, int]]:
    """
    Calculates the bounds of a unit at an offset within a given string of text
    using the Windows uniscribe library, also used in Notepad, for example.
    Units supported are character and word.
    @param lineText: the text string to analyze
    @param unit: the TextInfo unit (character or word)
    @param relOffset: the character offset within the text string at which to calculate the bounds.
    @return: a (start, end) tuple of offsets relative to the start of lineText,
        or None if the helper function reported failure.
    @raise NotImplementedError: if unit is neither the word nor the character unit.
    """
    # Units are plain strings: compare by equality rather than identity,
    # so that an equal but non-identical string is still recognised.
    if unit == textInfos.UNIT_WORD:
        helperFunc = NVDAHelper.localLib.calculateWordOffsets
    elif unit == textInfos.UNIT_CHARACTER:
        helperFunc = NVDAHelper.localLib.calculateCharacterOffsets
    else:
        raise NotImplementedError(f"Unit: {unit}")
    # Output parameters filled in by the helper function.
    startOut = ctypes.c_int()
    endOut = ctypes.c_int()
    # uniscribe does some strange things
    # when you give it a string with not more than two alphanumeric chars in a row.
    # Inject two alphanumeric characters at the end to fix this
    uniscribeLineText = lineText + "xx"
    # We can't rely on len(lineText) to calculate the length of the line.
    offsetConverter = textUtils.WideStringOffsetConverter(lineText)
    lineLength = offsetConverter.wideStringLength
    needsConversion = self.encoding != textUtils.WCHAR_ENCODING
    if needsConversion:
        # We need to convert the str based line offsets to wide string offsets.
        relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
    uniscribeLineLength = lineLength + 2
    succeeded = helperFunc(
        uniscribeLineText,
        uniscribeLineLength,
        relOffset,
        ctypes.byref(startOut),
        ctypes.byref(endOut)
    )
    if not succeeded:
        log.debugWarning(f"Uniscribe failed to calculate {unit} offsets for text {lineText!r}")
        return None
    relStart = startOut.value
    # Clamp the end so that it never extends into the two injected padding characters.
    relEnd = min(lineLength, endOut.value)
    if needsConversion:
        # We need to convert the uniscribe based offsets to str offsets.
        relStart, relEnd = offsetConverter.wideToStrOffsets(relStart, relEnd)
    return (relStart, relEnd)

def _getCharacterOffsets(self, offset):
    """
    Calculates the start and end offsets of the character at the given offset.
    Uniscribe is tried first when self.useUniscribe is set;
    if it is not used or fails, a simpler per-encoding calculation is used.
    @param offset: the offset at which to find the character bounds.
    @return: a (start, end) tuple of offsets.
    @raise NotImplementedError: if self.encoding is not one of the supported encodings.
    """
    if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
        raise NotImplementedError
    # The line bounds and text are calculated once and shared by both strategies below.
    lineStart, lineEnd = self._getLineOffsets(offset)
    lineText = self._getTextRange(lineStart, lineEnd)
    relOffset = offset - lineStart
    if self.useUniscribe:
        offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
        if offsets is not None:
            # Uniscribe offsets are relative to the line; make them absolute.
            return (offsets[0] + lineStart, offsets[1] + lineStart)
    if self.encoding == textUtils.WCHAR_ENCODING:
        offsetConverter = textUtils.WideStringOffsetConverter(lineText)
        # Round trip wide -> str -> wide offsets.
        # NOTE(review): presumably this widens the range to cover a whole surrogate pair
        # rather than splitting it - confirm against textUtils.WideStringOffsetConverter.
        relStrStart, relStrEnd = offsetConverter.wideToStrOffsets(relOffset, relOffset + 1)
        relWideStringStart, relWideStringEnd = offsetConverter.strToWideOffsets(relStrStart, relStrEnd)
        return (relWideStringStart + lineStart, relWideStringEnd + lineStart)
    # For the remaining supported encodings, one offset is treated as one character.
    return (offset, offset + 1)

def _getWordOffsets(self,offset):
if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
@@ -323,33 +371,9 @@ def _getWordOffsets(self,offset):
lineText = lineText.translate({0:u' ',0xa0:u' '})
relOffset = offset - lineStart
if self.useUniscribe:
relStart=ctypes.c_int()
relEnd=ctypes.c_int()
# uniscribe does some strange things when you give it a string with not more than two alphanumeric chars in a row.
# Inject two alphanumeric characters at the end to fix this
uniscribeLineText = lineText + "xx"
# We can't rely on len(lineText) to calculate the length of the line.
if self.encoding != textUtils.WCHAR_ENCODING:
# We need to convert the str based line offsets to wide string offsets.
offsetConverter = textUtils.WideStringOffsetConverter(lineText)
lineLength = offsetConverter.wideStringLength
relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
else:
lineLength = (lineEnd - lineStart)
uniscribeLineLength = lineLength + 2
if NVDAHelper.localLib.calculateWordOffsets(
uniscribeLineText,
uniscribeLineLength,
relOffset,
ctypes.byref(relStart),
ctypes.byref(relEnd)
):
relStart = relStart.value
relEnd = min(lineLength, relEnd.value)
if self.encoding != textUtils.WCHAR_ENCODING:
# We need to convert the uniscribe based offsets to str offsets.
relStart, relEnd = offsetConverter.wideToStrOffsets(relStart, relEnd)
return (relStart + lineStart , relEnd + lineStart)
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
#Fall back to the older word offsets detection that only breaks on non alphanumeric
if self.encoding == textUtils.WCHAR_ENCODING:
offsetConverter = textUtils.WideStringOffsetConverter(lineText)

0 comments on commit 1045d2d

Please sign in to comment.
You can’t perform that action at this time.