Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use uniscribe to calculate character offsets where allowed #10550

Merged
merged 6 commits into from Nov 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions nvdaHelper/local/nvdaHelperLocal.def
Expand Up @@ -51,6 +51,7 @@ EXPORTS
displayModel_getCaretRect
displayModel_requestTextChangeNotificationsForWindow
calculateWordOffsets
calculateCharacterOffsets
findWindowWithClassInThread
registerUIAProperty
dllImportTableHooks_hookSingle
Expand Down
124 changes: 85 additions & 39 deletions nvdaHelper/local/textUtils.cpp
@@ -1,7 +1,31 @@
/*
This file is a part of the NVDA project.
URL: http://www.nvda-project.org/
Copyright 2008-2019 NV Access Limited.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2.0, as published by
the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
This license can be found at:
http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
*/

#include <windows.h>
#include <usp10.h>
#include <common/log.h>

bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
// Text units for which offsets can be calculated from Uniscribe's logical attributes.
enum UNIT {
	UNIT_CHARACTER = 0, // bounds are taken from character stops (fCharStop)
	UNIT_WORD = 1 // bounds are taken from word stops (fWordStop)
};

bool _calculateUniscribeOffsets(enum UNIT unit, wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
if(unit!=UNIT_CHARACTER&&unit!=UNIT_WORD) {
LOG_ERROR(L"Unsupported unit");
return false;
}
if(textLength<=0) return false;
if(offset<0) return false;
if(offset>=textLength) {
Expand All @@ -27,51 +51,73 @@ bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startO
}
}
delete[] pItems;
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fWordStop) {
*startOffset=i;
break;
if(unit==UNIT_CHARACTER) {
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fCharStop) {
*startOffset=i;
break;
}
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
bool skipWhitespace=true;
for(int i=offset;i>=*startOffset;--i) {
if(iswspace(text[i])) {
if(skipWhitespace) {
// If we start in a block of whitespace, the word must start before this,
// as whitespace is included at the end of a word.
// Therefore, skip the whitespace and keep searching.
continue;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fCharStop) {
*endOffset=i;
break;
}
// This is whitespace. The word starts after it.
*startOffset=i+1;
break;
} else
skipWhitespace=false;
}
*endOffset=textLength;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fWordStop) {
*endOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
for(int i=offset;i<*endOffset;++i) {
if(iswspace(text[i])) {
// This begins a block of whitespace. The word ends after it.
// Find the end of the whitespace.
for(;i<*endOffset;++i) {
if(!iswspace(text[i]))
break;
} else if(unit==UNIT_WORD) {
for(int i=offset;i>=0;--i) {
if(logAttrArray[i].fWordStop) {
*startOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
bool skipWhitespace=true;
for(int i=offset;i>=*startOffset;--i) {
if(iswspace(text[i])) {
if(skipWhitespace) {
// If we start in a block of whitespace, the word must start before this,
// as whitespace is included at the end of a word.
// Therefore, skip the whitespace and keep searching.
continue;
}
// This is whitespace. The word starts after it.
*startOffset=i+1;
break;
} else
skipWhitespace=false;
}
*endOffset=textLength;
for(int i=offset+1;i<textLength;++i) {
if(logAttrArray[i].fWordStop) {
*endOffset=i;
break;
}
}
// #1656: fWordStop doesn't seem to stop on whitespace where punctuation follows the whitespace.
for(int i=offset;i<*endOffset;++i) {
if(iswspace(text[i])) {
// This begins a block of whitespace. The word ends after it.
// Find the end of the whitespace.
for(;i<*endOffset;++i) {
if(!iswspace(text[i]))
break;
}
// We're now positioned on the first non-whitespace character,
// so the word ends here.
*endOffset=i;
break;
}
// We're now positioned on the first non-whitespace character,
// so the word ends here.
*endOffset=i;
break;
}
}
delete[] logAttrArray;
return true;
}

// Exported entry point: finds the bounds of the word containing `offset` in `text`.
// Delegates to _calculateUniscribeOffsets with UNIT_WORD and passes its result through
// unchanged; on success the bounds are written to *startOffset and *endOffset.
bool calculateWordOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
	const bool succeeded = _calculateUniscribeOffsets(
		UNIT_WORD,
		text,
		textLength,
		offset,
		startOffset,
		endOffset
	);
	return succeeded;
}

// Exported entry point: finds the bounds of the character containing `offset` in `text`.
// Delegates to _calculateUniscribeOffsets with UNIT_CHARACTER and passes its result through
// unchanged; on success the bounds are written to *startOffset and *endOffset.
bool calculateCharacterOffsets(wchar_t* text, int textLength, int offset, int* startOffset, int* endOffset) {
	const bool succeeded = _calculateUniscribeOffsets(
		UNIT_CHARACTER,
		text,
		textLength,
		offset,
		startOffset,
		endOffset
	);
	return succeeded;
}
94 changes: 59 additions & 35 deletions source/textInfos/offsets.py
Expand Up @@ -16,7 +16,7 @@
import api
import textUtils
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Tuple
import locale
from logHandler import log

Expand Down Expand Up @@ -301,18 +301,66 @@ def _getFormatFieldAndOffsets(self,offset,formatConfig,calculateOffsets=True):
formatField["line-number"]=lineNum+1
return formatField,(startOffset,endOffset)

def _getCharacterOffsets(self,offset):
def _calculateUniscribeOffsets(self, lineText: str, unit: str, relOffset: int) -> Optional[Tuple[int, int]]:
	"""
	Calculates the bounds of a unit at an offset within a given string of text
	using the Windows uniscribe library, also used in Notepad, for example.
	Units supported are character and word.
	@param lineText: the text string to analyze
	@param unit: the TextInfo unit (character or word)
	@param relOffset: the character offset within the text string at which to calculate the bounds.
	@return: a (start, end) tuple of offsets relative to the start of lineText,
		expressed in the same kind of offsets as relOffset,
		or None if the uniscribe helper reported failure.
	@raise NotImplementedError: if unit is neither character nor word.
	"""
	# Units are plain strings, so compare them by value.
	# An identity check would reject an equal string that is a distinct object.
	if unit == textInfos.UNIT_WORD:
		helperFunc = NVDAHelper.localLib.calculateWordOffsets
	elif unit == textInfos.UNIT_CHARACTER:
		helperFunc = NVDAHelper.localLib.calculateCharacterOffsets
	else:
		raise NotImplementedError(f"Unit: {unit}")
	relStart = ctypes.c_int()
	relEnd = ctypes.c_int()
	# uniscribe does some strange things
	# when you give it a string with not more than two alphanumeric chars in a row.
	# Inject two alphanumeric characters at the end to fix this
	uniscribeLineText = lineText + "xx"
	# We can't rely on len(lineText) to calculate the length of the line.
	offsetConverter = textUtils.WideStringOffsetConverter(lineText)
	lineLength = offsetConverter.wideStringLength
	needsOffsetConversion = self.encoding != textUtils.WCHAR_ENCODING
	if needsOffsetConversion:
		# We need to convert the str based line offsets to wide string offsets.
		relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
	# Account for the two padding characters injected above.
	uniscribeLineLength = lineLength + 2
	succeeded = helperFunc(
		uniscribeLineText,
		uniscribeLineLength,
		relOffset,
		ctypes.byref(relStart),
		ctypes.byref(relEnd)
	)
	if not succeeded:
		log.debugWarning(f"Uniscribe failed to calculate {unit} offsets for text {lineText!r}")
		return None
	startOffset = relStart.value
	# Clamp the end so that it never extends into the injected padding characters.
	endOffset = min(lineLength, relEnd.value)
	if needsOffsetConversion:
		# We need to convert the uniscribe based offsets to str offsets.
		startOffset, endOffset = offsetConverter.wideToStrOffsets(startOffset, endOffset)
	return (startOffset, endOffset)

def _getCharacterOffsets(self, offset):
if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
raise NotImplementedError
lineStart, lineEnd = self._getLineOffsets(offset)
lineText = self._getTextRange(lineStart, lineEnd)
relOffset = offset - lineStart
if self.useUniscribe:
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
if self.encoding == textUtils.WCHAR_ENCODING:
lineStart,lineEnd=self._getLineOffsets(offset)
lineText=self._getTextRange(lineStart,lineEnd)
offsetConverter = textUtils.WideStringOffsetConverter(lineText)
relOffset = offset - lineStart
relStrStart, relStrEnd = offsetConverter.wideToStrOffsets(relOffset, relOffset + 1)
relWideStringStart, relWideStringEnd = offsetConverter.strToWideOffsets(relStrStart, relStrEnd)
return (relWideStringStart + lineStart, relWideStringEnd + lineStart)
elif self.encoding not in (None, "utf_32_le", locale.getlocale()[1]):
raise NotImplementedError
return offset, offset + 1
return (offset, offset + 1)

def _getWordOffsets(self,offset):
if self.encoding not in (textUtils.WCHAR_ENCODING, None, "utf_32_le", locale.getlocale()[1]):
Expand All @@ -323,33 +371,9 @@ def _getWordOffsets(self,offset):
lineText = lineText.translate({0:u' ',0xa0:u' '})
relOffset = offset - lineStart
if self.useUniscribe:
relStart=ctypes.c_int()
relEnd=ctypes.c_int()
# uniscribe does some strange things when you give it a string with not more than two alphanumeric chars in a row.
# Inject two alphanumeric characters at the end to fix this
uniscribeLineText = lineText + "xx"
# We can't rely on len(lineText) to calculate the length of the line.
if self.encoding != textUtils.WCHAR_ENCODING:
# We need to convert the str based line offsets to wide string offsets.
offsetConverter = textUtils.WideStringOffsetConverter(lineText)
lineLength = offsetConverter.wideStringLength
relOffset = offsetConverter.strToWideOffsets(relOffset, relOffset)[0]
else:
lineLength = (lineEnd - lineStart)
uniscribeLineLength = lineLength + 2
if NVDAHelper.localLib.calculateWordOffsets(
uniscribeLineText,
uniscribeLineLength,
relOffset,
ctypes.byref(relStart),
ctypes.byref(relEnd)
):
relStart = relStart.value
relEnd = min(lineLength, relEnd.value)
if self.encoding != textUtils.WCHAR_ENCODING:
# We need to convert the uniscribe based offsets to str offsets.
relStart, relEnd = offsetConverter.wideToStrOffsets(relStart, relEnd)
return (relStart + lineStart , relEnd + lineStart)
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
# Fall back to the older word offsets detection, which only breaks on non-alphanumeric characters.
if self.encoding == textUtils.WCHAR_ENCODING:
offsetConverter = textUtils.WideStringOffsetConverter(lineText)
Expand Down