Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python 3: When processing xml coming from virtual buffers, make sure that surrogate characters are handled properly #9897

Merged
merged 3 commits into from Jul 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions source/NVDAObjects/IAccessible/__init__.py
Expand Up @@ -15,7 +15,7 @@
import tones
import languageHandler
import textInfos.offsets
from textUtils import HIGH_SURROGATE_FIRST, HIGH_SURROGATE_LAST, LOW_SURROGATE_FIRST, LOW_SURROGATE_LAST
import textUtils
import colors
import time
import displayModel
Expand Down Expand Up @@ -261,7 +261,7 @@ def _getCharacterOffsets(self,offset):
start,end,text = self.obj.IAccessibleTextObject.TextAtOffset(offset,IAccessibleHandler.IA2_TEXT_BOUNDARY_CHAR)
except COMError:
return super(IA2TextTextInfo,self)._getCharacterOffsets(offset)
if HIGH_SURROGATE_FIRST <= text <= HIGH_SURROGATE_LAST or LOW_SURROGATE_FIRST <= text <= LOW_SURROGATE_LAST:
if textUtils.isHighSurrogate(text) or textUtils.isLowSurrogate(text):
# #8953: Some IA2 implementations, including Gecko and Chromium,
# erroneously report one offset for surrogates.
return super(IA2TextTextInfo,self)._getCharacterOffsets(offset)
Expand Down
15 changes: 12 additions & 3 deletions source/XMLFormatting.py
@@ -1,6 +1,13 @@
#XMLFormatting.py
#A part of NonVisual Desktop Access (NVDA)
#Copyright (C) 2008-2019 NV Access Limited, Babbage B.V.
#This file is covered by the GNU General Public License.
#See the file COPYING for more details.

from xml.parsers import expat
import textInfos
from logHandler import log
from textUtils import WCHAR_ENCODING, isLowSurrogate

class XMLTextParser(object):

Expand All @@ -19,7 +26,7 @@ def _startElementHandler(self,tagName,attrs):
data=chr(int(data))
except ValueError:
data=u'\ufffd'
self._CharacterDataHandler(data)
self._CharacterDataHandler(data, processBufferedSurrogates=isLowSurrogate(data))
return
elif tagName=='control':
newAttrs=textInfos.ControlField(attrs)
Expand Down Expand Up @@ -48,10 +55,12 @@ def _EndElementHandler(self,tagName):
else:
raise ValueError("unknown tag name: %s"%tagName)

def _CharacterDataHandler(self,data):
def _CharacterDataHandler(self,data, processBufferedSurrogates=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was the processBufferedSurrogates arg added?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I added this parameter without actually using it. Let me fix that.

cmdList=self._commandList
if cmdList and isinstance(cmdList[-1],str):
cmdList[-1]+=data
cmdList[-1] += data
if processBufferedSurrogates:
cmdList[-1] = cmdList[-1].encode(WCHAR_ENCODING, errors="surrogatepass").decode(WCHAR_ENCODING)
else:
cmdList.append(data)

Expand Down
22 changes: 16 additions & 6 deletions source/textUtils.py
Expand Up @@ -17,10 +17,6 @@
import locale
from logHandler import log

HIGH_SURROGATE_FIRST = u"\uD800"
HIGH_SURROGATE_LAST = u"\uDBFF"
LOW_SURROGATE_FIRST = u"\uDC00"
LOW_SURROGATE_LAST = u"\uDFFF"
WCHAR_ENCODING = "utf_16_le"

class WideStringOffsetConverter:
Expand Down Expand Up @@ -169,9 +165,9 @@ def wideToStrOffsets(
# They take one offset in the resulting string, so our offsets are off by one.
if (
precedingStr
and HIGH_SURROGATE_FIRST <= precedingStr[-1] <= HIGH_SURROGATE_LAST
and isHighSurrogate(precedingStr[-1])
and decodedRange
and LOW_SURROGATE_FIRST <= decodedRange[0] <= LOW_SURROGATE_LAST
and isLowSurrogate(decodedRange[0])
):
strStart -= 1
strEnd -= 1
Expand Down Expand Up @@ -225,3 +221,17 @@ def getTextFromRawBytes(
log.debugWarning("Error decoding text in %r, probably wrong encoding assumed or incomplete data" % buf)
text = rawText.decode(encoding, errors=errorsFallback)
return text

# Inclusive bounds of the UTF-16 high (leading) surrogate range.
HIGH_SURROGATE_FIRST = u"\uD800"
HIGH_SURROGATE_LAST = u"\uDBFF"

def isHighSurrogate(ch: str) -> bool:
	"""Returns whether the given character is a high (leading) UTF-16 surrogate.

	Only a string of exactly one character in the inclusive range
	HIGH_SURROGATE_FIRST to HIGH_SURROGATE_LAST is accepted.
	An empty string, or a string of more than one character, is never a surrogate.
	"""
	# Strings compare lexicographically, so the range check alone would also accept
	# longer strings that merely begin with a high surrogate (e.g. "\ud83dx").
	# Checking the length first restricts the test to a single character.
	return len(ch) == 1 and HIGH_SURROGATE_FIRST <= ch <= HIGH_SURROGATE_LAST

# Inclusive bounds of the UTF-16 low (trailing) surrogate range.
LOW_SURROGATE_FIRST = u"\uDC00"
LOW_SURROGATE_LAST = u"\uDFFF"

def isLowSurrogate(ch: str) -> bool:
	"""Returns whether the given character is a low (trailing) UTF-16 surrogate.

	Only a string of exactly one character in the inclusive range
	LOW_SURROGATE_FIRST to LOW_SURROGATE_LAST is accepted.
	An empty string, or a string of more than one character, is never a surrogate.
	"""
	# Strings compare lexicographically, so the range check alone would also accept
	# longer strings that merely begin with a low surrogate (e.g. "\udc00abc").
	# Checking the length first restricts the test to a single character.
	return len(ch) == 1 and LOW_SURROGATE_FIRST <= ch <= LOW_SURROGATE_LAST