Skip to content

Commit

Permalink
Fix incorrect breaking of words at marks such as vowel signs and virama in Indic languages.
Browse files Browse the repository at this point in the history

speech.speakTypedCharacters and textInfos.offsets.find{Start,End}OfWord were using unicode.isalnum to check for characters that are part of a word, but this only covers alphanumeric characters. The marks in question aren't alphanumeric, but should still be considered part of a word.
Therefore, use the Unicode category of the character and include letters, marks and numbers.
Fixes #4254.
  • Loading branch information
jcsteh committed Aug 5, 2014
1 parent 360d408 commit 7ffeb00
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 9 deletions.
3 changes: 2 additions & 1 deletion source/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import itertools
import weakref
import unicodedata
import colors
import globalVars
from logHandler import log
Expand Down Expand Up @@ -539,7 +540,7 @@ def speakTypedCharacters(ch):
realChar="*"
else:
realChar=ch
if ch.isalnum():
if unicodedata.category(ch)[0] in "LMN":
curWordChars.append(realChar)
elif ch=="\b":
# Backspace, so remove the last character from our buffer.
Expand Down
17 changes: 9 additions & 8 deletions source/textInfos/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import re
import ctypes
import unicodedata
import NVDAHelper
import config
import textInfos
Expand Down Expand Up @@ -61,7 +62,7 @@ def findStartOfLine(text,offset,lineLength=None):
def findEndOfLine(text,offset,lineLength=None):
"""Searches forwards through the given text from the given offset, until it finds the offset that is the start of the next line. Without a set line length, it searches for new line / carriage return characters; with a set line length, it simply moves forward to sit on a multiple of the line length.
@param text: the text to search
@type text: string
@type text: unicode
@param offset: the offset of the text to start at
@type offset: int
@param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
Expand All @@ -88,7 +89,7 @@ def findEndOfLine(text,offset,lineLength=None):
def findStartOfWord(text,offset,lineLength=None):
"""Searches backwards through the given text from the given offset, until it finds the offset that is the start of the word. It checks to see if a character is alphanumeric, or is another symbol, or is white space.
@param text: the text to search
@type text: string
@type text: unicode
@param offset: the offset of the text to start at
@type offset: int
@param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
Expand All @@ -100,17 +101,17 @@ def findStartOfWord(text,offset,lineLength=None):
return offset
while offset>0 and text[offset].isspace():
offset-=1
if not text[offset].isalnum():
if unicodedata.category(text[offset])[0] not in "LMN":
return offset
else:
while offset>0 and text[offset-1].isalnum():
while offset>0 and unicodedata.category(text[offset-1])[0] in "LMN":
offset-=1
return offset

def findEndOfWord(text,offset,lineLength=None):
"""Searches forwards through the given text from the given offset, until it finds the offset that is the start of the next word. It checks to see if a character is alphanumeric, or is another symbol, or is white space.
@param text: the text to search
@type text: string
@type text: unicode
@param offset: the offset of the text to start at
@type offset: int
@param lineLength: The number of characters that makes up a line, None if new line characters should be looked at instead
Expand All @@ -120,10 +121,10 @@ def findEndOfWord(text,offset,lineLength=None):
"""
if offset>=len(text):
return offset+1
if text[offset].isalnum():
while offset<len(text) and text[offset].isalnum():
if unicodedata.category(text[offset])[0] in "LMN":
while offset<len(text) and unicodedata.category(text[offset])[0] in "LMN":
offset+=1
elif not text[offset].isspace() and not text[offset].isalnum():
elif unicodedata.category(text[offset])[0] not in "LMNZ":
offset+=1
while offset<len(text) and text[offset].isspace():
offset+=1
Expand Down
1 change: 1 addition & 0 deletions user_docs/en/changes.t2t
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
- Microsoft Outlook's Signature dialog: the Signature editing field is now accessible, allowing for full cursor tracking and format detection. (#3833)
- Microsoft Word: When reading the last line of a table cell, the entire table cell is no longer read. (#3421)
- Microsoft Word: When reading the first or last line of a table of contents, the entire table of contents is no longer read. (#3421)
- When speaking typed words and in some other cases, words are no longer incorrectly broken at marks such as vowel signs and virama in Indic languages. (#4254)


== Changes for Developers ==
Expand Down

0 comments on commit 7ffeb00

Please sign in to comment.