nvaccess · dineshkaushal · Aug 14, 2017 · Aug 15, 2017 · Aug 15, 2017 · Aug 15, 2017
@@ -19,6 +19,7 @@
 # maintains list of priority languages as a list of languageID, ScriptName, and LanguageDescription
 languagePriorityListSpec = []
 
+"""scriptIDToLangID is the reverse of langIDToScriptID and is used to obtain the language of the current script. The language of a script is used to detect whether a chunk should be broken for languages that use multiple scripts."""
 scriptIDToLangID = {}
 
 LanguageDescription = namedtuple("LanguageDescription" , "languageID description")

@@ -10,6 +10,8 @@
 These entries should not overlap, but there could be gaps
 """
 
+import bisect
+
 # unicode digit constants
 DIGIT_ZERO = 0x30
 DIGIT_NINE = 0x39
@@ -885,28 +887,49 @@
 	( 0Xe0020 , 0Xe007f , "Common" ), 
 ]
 
+
+unicodeScriptRangeEnd = [ k[1] for k in scriptRanges]
+
 def getScriptCode(chr):
 	"""performs a binary search in scriptRanges for unicode ranges
 	@param chr: character for which a script should be found
 	@type chr: string
 	@return: script name of the range containing chr, or None if no range contains it
 	@rtype: string or None"""
-	mStart = 0
-	mEnd = len(scriptRanges)-1
 	characterUnicodeCode = ord(chr)
 	# Number should respect preferred language setting
 	# FullWidthNumber is in Common category, however, it indicates Japanese language context
 	if DIGIT_ZERO  <= characterUnicodeCode <= DIGIT_NINE:
 		return "Number"
 	elif FULLWIDTH_ZERO <= characterUnicodeCode <= FULLWIDTH_NINE: 
 		return "FullWidthNumber"
-	while( mEnd >= mStart ):
-		midPoint = (mStart + mEnd ) >> 1
-		if characterUnicodeCode < scriptRanges[midPoint][0]: 
-			mEnd = midPoint -1
-		elif characterUnicodeCode > scriptRanges[midPoint][1]: 
-			mStart = midPoint + 1
-		else:
-			return scriptRanges[midPoint][2] 
-	return None
 
+	# Based on the following assumptions: 
+	# - ranges must not overlap
+	# - range end and start values are included in that range
+	# - there may be gaps between ranges.
+
+	# Approach: Look for the first index of a range where the range end value is greater
+	# than or equal to the code we are searching for. If this is found, and the start value
+	# for this range is less than or equal to the code we are searching for, then we have
+	# found the range. That is, startValue <= characterUnicodeCode <= endValue
+
+	index = bisect.bisect_left(unicodeScriptRangeEnd, characterUnicodeCode )
+	if index == len(unicodeScriptRangeEnd):
+		# there is no value of index such that: `characterUnicodeCode <= scriptRanges[index][1]`
+		# characterUnicodeCode is larger than all of the range end values so a range is not 
+		# found for the value:
+		return None
+
+	# Since the range at index is the first where `characterUnicodeCode <= rangeEnd` is True,
+	# we now ensure that for the range at the index `characterUnicodeCode >= rangeStart` 
+	# is also True. 
+	candidateRange = scriptRanges[index]
+	rangeStart = candidateRange[0]
+	if rangeStart > characterUnicodeCode :
+		# characterUnicodeCode comes before the start of the range at index so a range 
+		# is not found for the value
+		return None
+
+	rangeName = candidateRange[2]
+	return rangeName
@@ -9,6 +9,7 @@
 import unittest
 import languageDetection
 from speech import LangChangeCommand
+from unicodeScriptData import scriptRanges
 import config
 
 class TestLanguageDetection(unittest.TestCase):
@@ -271,3 +272,19 @@ def test_englishWithGreekTextWithEnglishAsDefaultAndPreferedLanguageAsHindi(self
 		languageDetection.updateLanguagePriorityFromConfig()
 		self.compareSpeechSequence(detectedLanguageSequence  , testSequence) 
 
+	def test_unicodeRangesEntryStartLessEqualEnd(self):
+		for scriptRangeStart, scriptRangeEnd, scriptName in scriptRanges:
+			self.assertTrue(scriptRangeStart <= scriptRangeEnd)
+
+	def test_unicodeRangesEntriesDoNotOverlapAndAreSorted(self):
+		for index in xrange( len(scriptRanges) -1): 
+			# check that there is no overlap and that the ranges are sorted
+			currentRange = scriptRanges[index]
+			nextRange = scriptRanges[index+1]
+			currentRangeEnd = currentRange[1]
+			nextRangeStart = nextRange[0]
+			self.assertTrue(currentRangeEnd   < nextRangeStart)
+
+	def test_unicodeRangesEntryScriptNamesExist(self):
+		for scriptRangeStart, scriptRangeEnd, scriptName in scriptRanges:
+			self.assertTrue(scriptName)