Fix incorrect breaking of words at marks such as vowel signs and virama in Indic languages.

speech.speakTypedCharacters and textInfos.offsets.find{Start,End}OfWord were using unicode.isalnum to check for characters that are part of a word, but this only covers alphanumeric characters. The marks in question aren't alphanumeric, but should still be considered part of a word. Therefore, use the Unicode category of the character and include letters, marks and numbers. Fixes #4254.
nvaccess · Aug 5, 2014 · 7ffeb00 · 7ffeb00
1 parent 360d408
commit 7ffeb00
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 9 deletions.
diff --git a/source/speech.py b/source/speech.py
@@ -10,6 +10,7 @@
 
 import itertools
 import weakref
+import unicodedata
 import colors
 import globalVars
 from logHandler import log
@@ -539,7 +540,7 @@ def speakTypedCharacters(ch):
 		realChar="*"
 	else:
 		realChar=ch
-	if ch.isalnum():
+	if unicodedata.category(ch)[0] in "LMN":
 		curWordChars.append(realChar)
 	elif ch=="\b":
 		# Backspace, so remove the last character from our buffer.

diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py
@@ -6,6 +6,7 @@
 
 import re
 import ctypes
+import unicodedata
 import NVDAHelper
 import config
 import textInfos
@@ -61,7 +62,7 @@ def findStartOfLine(text,offset,lineLength=None):
 def findEndOfLine(text,offset,lineLength=None):
 	"""Searches forwards through the given text from the given offset, until it finds the offset that is the start of the next line. With out a set line length, it searches for new line / cariage return characters, with a set line length it simply moves forward to sit on a multiple of the line length.
 @param text: the text to search
-@type text: string
+@type text: unicode
 @param offset: the offset of the text to start at
 @type offset: int
 @param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
@@ -88,7 +89,7 @@ def findEndOfLine(text,offset,lineLength=None):
 def findStartOfWord(text,offset,lineLength=None):
 	"""Searches backwards through the given text from the given offset, until it finds the offset that is the start of the word. It checks to see if a character is alphanumeric, or is another symbol , or is white space.
 @param text: the text to search
-@type text: string
+@type text: unicode
 @param offset: the offset of the text to start at
 @type offset: int
 @param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
@@ -100,17 +101,17 @@ def findStartOfWord(text,offset,lineLength=None):
 		return offset
 	while offset>0 and text[offset].isspace():
 		offset-=1
-	if not text[offset].isalnum():
+	if unicodedata.category(text[offset])[0] not in "LMN":
 		return offset
 	else:
-		while offset>0 and text[offset-1].isalnum():
+		while offset>0 and unicodedata.category(text[offset-1])[0] in "LMN":
 			offset-=1
 	return offset
 
 def findEndOfWord(text,offset,lineLength=None):
 	"""Searches forwards through the given text from the given offset, until it finds the offset that is the start of the next word. It checks to see if a character is alphanumeric, or is another symbol , or is white space.
 @param text: the text to search
-@type text: string
+@type text: unicode
 @param offset: the offset of the text to start at
 @type offset: int
 @param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
@@ -120,10 +121,10 @@ def findEndOfWord(text,offset,lineLength=None):
 """
 	if offset>=len(text):
 		return offset+1
-	if text[offset].isalnum():
-		while offset<len(text) and text[offset].isalnum():
+	if unicodedata.category(text[offset])[0] in "LMN":
+		while offset<len(text) and unicodedata.category(text[offset])[0] in "LMN":
 			offset+=1
-	elif not text[offset].isspace() and not text[offset].isalnum():
+	elif unicodedata.category(text[offset])[0] not in "LMNZ":
 		offset+=1
 	while offset<len(text) and text[offset].isspace():
 		offset+=1

diff --git a/user_docs/en/changes.t2t b/user_docs/en/changes.t2t
@@ -77,6 +77,7 @@
 - Microsoft Outlook's Signature dialog: the Signature editing field is now accessible, allowing for full cursor tracking and format detection. (#3833) 
 - Microsoft Word: When reading the last line of a table cell, the entire table cell is no longer read. (#3421)
 - Microsoft Word: When reading the first or last line of a table of contents, the entire table of contents is no longer read. (#3421)
+- When speaking typed words and in some other cases, words are no longer incorrectly broken at marks such as vowel signs and virama in Indic languages. (#4254)
 
 
 == Changes for Developers ==