In [1]:
import string,re

In [2]:
import codecs,itertools

In [3]:
def offset_to_char(c):
    """Return the character whose code point lies `c` positions past U+0900.

    U+0900 is the first code point of the Unicode Devanagari block, so `c`
    is an offset into that block.
    """
    devanagari_base = 0x0900
    return chr(devanagari_base + c)

def is_consonant(c):
    """Tell whether the single character `c` sits at offset 0x15..0x39 from U+0900.

    Both ends of the range are inclusive.
    """
    return 0x15 <= ord(c) - 0x900 <= 0x39

In [4]:
class NormalizerI(object):
    """Base class shared by the normalizers.

    Declares the invisible / formatting characters as class constants and
    offers a punctuation clean-up helper. `normalize` is only a stub here.
    """

    # Invisible marks
    BYTE_ORDER_MARK='\uFEFF'
    BYTE_ORDER_MARK_2='\uFFFE'
    WORD_JOINER='\u2060'
    SOFT_HYPHEN='\u00AD'

    # Space variants
    ZERO_WIDTH_SPACE='\u200B'
    NO_BREAK_SPACE='\u00A0'

    # Joiner controls
    ZERO_WIDTH_NON_JOINER='\u200C'
    ZERO_WIDTH_JOINER='\u200D'

    def _normalize_punctuations(self, text):
        """Map typographic punctuation in `text` to plain ASCII equivalents.

        The byte order mark is dropped as well. The substitutions are applied
        one after another in the listed order; the order matters because the
        doubled-apostrophe rule runs after the single-quote rules have
        produced apostrophes.
        """
        ordered_rules = (
            (NormalizerI.BYTE_ORDER_MARK, ''),
            ('„', '"'),
            ('“', '"'),
            ('”', '"'),
            ('–', '-'),
            ('—', ' - '),
            ('´', "'"),
            ('‘', "'"),
            ('‚', "'"),
            ('’', "'"),
            ("''", '"'),
            ('´´', '"'),
            ('…', '...'),
        )
        for old, new in ordered_rules:
            text = text.replace(old, new)
        return text

    def normalize(self,text):
        """Stub: performs no work and returns None in this base class."""
        return None

In [5]:
class BaseNormalizer(NormalizerI):
    """Normalizer for text made of code points in U+0900-U+097F (Devanagari).

    `normalize` always strips invisible characters and cleans up punctuation.
    Three further steps are controlled through the constructor: chandra
    normalization, nasal normalization and word-final vowel-ending
    normalization.
    """

    # Offsets (relative to U+0900) of the halant/virama and anusvaara signs.
    _HALANT_OFFSET = 0x4d
    _ANUSVAARA_OFFSET = 0x02

    # (nasal, first consonant, last consonant) offsets: each nasal consonant
    # with the contiguous consonant range it is paired with. Offsets 0x28 and
    # 0x29 share the same range.
    _NASAL_SIGNATURES = (
        (0x19, 0x15, 0x18),
        (0x1e, 0x1a, 0x1d),
        (0x23, 0x1f, 0x22),
        (0x28, 0x24, 0x27),
        (0x29, 0x24, 0x27),
        (0x2e, 0x2a, 0x2d),
    )

    def __init__(self,
            remove_nuktas=False,
            nasals_mode='do_nothing',
            do_normalize_chandras=False,
            do_normalize_vowel_ending=False):
        """Store the options and precompute substitution tables and patterns.

        Args:
            remove_nuktas: stored on the instance; no method of this class
                consults it.
            nasals_mode: 'to_anusvaara_strict', 'to_anusvaara_relaxed' or
                'to_nasal_consonants'. Any other value (including the default
                'do_nothing') leaves nasals untouched.
            do_normalize_chandras: when true, `normalize` applies the chandra
                substitutions.
            do_normalize_vowel_ending: when true, `normalize` appends a halant
                to every space-separated word that ends in a consonant.
        """
        self.remove_nuktas=remove_nuktas
        self.nasals_mode=nasals_mode
        self.do_normalize_chandras=do_normalize_chandras
        self.do_normalize_vowel_ending=do_normalize_vowel_ending

        self._init_normalize_chandras()
        self._init_normalize_nasals()
        self._init_normalize_vowel_ending()

    def _init_normalize_vowel_ending(self):
        """Select the per-word ending rule; the halant-appending one is always used."""
        self.fn_vowel_ending=self._normalize_word_vowel_ending_ie

    def _init_normalize_chandras(self):
        """Build the (from, to) character pairs used by `_normalize_chandras`."""
        substitution_offsets = [
            [0x0d, 0x0f],   # letter candra E      -> letter E
            [0x11, 0x13],   # letter candra O      -> letter O
            [0x45, 0x47],   # vowel sign candra E  -> vowel sign E
            [0x49, 0x4b],   # vowel sign candra O  -> vowel sign O
            # [0x72 , 0x0f],

            [0x00, 0x02],   # inverted candrabindu -> anusvara
            [0x01, 0x02],   # candrabindu          -> anusvara
        ]

        self.chandra_substitutions = [
            (offset_to_char(src), offset_to_char(dst))
            for src, dst in substitution_offsets
        ]

    def _normalize_chandras(self,text):
        """Apply every chandra substitution to `text` and return the result."""
        for match, repl in self.chandra_substitutions:
            text=text.replace(match,repl)
        return text

    def _init_to_anusvaara_strict(self):
        """Prepare patterns for the strict nasal -> anusvaara rewrite.

        One pattern per signature row: nasal + halant followed by a consonant
        from that row's range. The trailing consonant is captured and kept.
        Stores `(list_of_patterns, replacement)` in `self.pats_repls`.
        """
        halant = offset_to_char(self._HALANT_OFFSET)
        anusvaara = offset_to_char(self._ANUSVAARA_OFFSET)

        pats = [
            re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
                nasal=offset_to_char(nasal),
                halant=halant,
                start_r=offset_to_char(start),
                end_r=offset_to_char(end),
            ))
            for nasal, start, end in self._NASAL_SIGNATURES
        ]

        # Anusvaara followed by a back-reference to the captured consonant.
        self.pats_repls = (pats, anusvaara + '\\1')

    def _to_anusvaara_strict(self,text):
        """Rewrite nasal + halant + paired consonant as anusvaara + consonant."""
        pats, repl_string = self.pats_repls
        for pat in pats:
            text=pat.sub(repl_string,text)

        return text

    def _init_to_anusvaara_relaxed(self):
        """Prepare the pattern for the relaxed nasal -> anusvaara rewrite.

        The pattern matches any listed nasal followed by a halant, whatever
        comes next. Stores `(pattern, replacement)` in `self.pats_repls`.
        """
        # Joined WITHOUT a separator: inside a regex character class every
        # character is a member, so a ',' separator would make the class
        # match a literal comma as well.
        nasals_list_str = ''.join(
            offset_to_char(nasal) for nasal, _, _ in self._NASAL_SIGNATURES
        )

        pat = re.compile(r'[{nasals_list_str}]{halant}'.format(
            nasals_list_str=nasals_list_str,
            halant=offset_to_char(self._HALANT_OFFSET),
        ))

        self.pats_repls = (pat, offset_to_char(self._ANUSVAARA_OFFSET))

    def _to_anusvaara_relaxed(self,text):
        """Rewrite every nasal + halant pair as a single anusvaara."""
        pat, repl_string = self.pats_repls
        return pat.sub(repl_string,text)

    def _init_to_nasal_consonants(self):
        """Prepare patterns for the anusvaara -> nasal consonant rewrite.

        One (pattern, replacement) pair per signature row: anusvaara followed
        by a consonant of the row's range becomes nasal + halant + consonant.
        Stores the list of pairs in `self.pats_repls`.
        """
        halant = offset_to_char(self._HALANT_OFFSET)
        anusvaara = offset_to_char(self._ANUSVAARA_OFFSET)

        self.pats_repls = [
            (
                re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
                    anusvaara=anusvaara,
                    start_r=offset_to_char(start),
                    end_r=offset_to_char(end),
                )),
                '{nasal}{halant}\\1'.format(
                    nasal=offset_to_char(nasal),
                    halant=halant,
                ),
            )
            for nasal, start, end in self._NASAL_SIGNATURES
        ]

    def _to_nasal_consonants(self,text):
        """Rewrite anusvaara + consonant as the paired nasal + halant + consonant.

        Pairs are applied in table order, so for the two rows sharing a range
        the earlier one consumes all matches.
        """
        for pat, repl in self.pats_repls:
            text=pat.sub(repl,text)

        return text

    def _init_normalize_nasals(self):
        """Run the initializer matching `self.nasals_mode`, if there is one."""
        if self.nasals_mode == 'to_anusvaara_strict':
            self._init_to_anusvaara_strict()
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            self._init_to_anusvaara_relaxed()
        elif self.nasals_mode == 'to_nasal_consonants':
            self._init_to_nasal_consonants()

    def _normalize_nasals(self,text):
        """Apply the rewrite selected by `self.nasals_mode`.

        Returns `text` unchanged for any unrecognised mode.
        """
        if self.nasals_mode == 'to_anusvaara_strict':
            return self._to_anusvaara_strict(text)
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            return self._to_anusvaara_relaxed(text)
        elif self.nasals_mode == 'to_nasal_consonants':
            return self._to_nasal_consonants(text)
        else:
            return text

    def _normalize_word_vowel_ending_dravidian(self,word):
        """Append the character at offset 0x3e (vowel sign AA) to a consonant-final word.

        Not selected by `_init_normalize_vowel_ending`; available as an
        alternative rule.
        """
        if len(word)>0 and is_consonant(word[-1]):
            return word+offset_to_char(0x3e)
        else:
            return word

    def _normalize_word_vowel_ending_ie(self,word):
        """Append a halant to `word` when its last character is a consonant."""
        if len(word)>0 and is_consonant(word[-1]):
            return word+offset_to_char(self._HALANT_OFFSET)
        else:
            return word

    def _normalize_vowel_ending(self,text):
        """Apply the selected word-ending rule to each space-separated word."""
        return ' '.join([ self.fn_vowel_ending(w) for w in text.split(' ') ])

    def normalize(self,text):
        """Return the normalized form of `text`.

        Steps, in order: remove byte order marks, word joiner and soft hyphen;
        turn zero-width space and no-break space into a plain space; remove
        zero-width (non-)joiners; clean up punctuation; then the optional
        chandra step, the nasal step and the optional vowel-ending step.
        """
        text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')
        text=text.replace(NormalizerI.BYTE_ORDER_MARK_2,'')
        text=text.replace(NormalizerI.WORD_JOINER,'')
        text=text.replace(NormalizerI.SOFT_HYPHEN,'')

        text=text.replace(NormalizerI.ZERO_WIDTH_SPACE,' ')
        text=text.replace(NormalizerI.NO_BREAK_SPACE,' ')

        text=text.replace(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
        text=text.replace(NormalizerI.ZERO_WIDTH_JOINER,'')

        text=self._normalize_punctuations(text)

        if self.do_normalize_chandras:
            text=self._normalize_chandras(text)
        text=self._normalize_nasals(text)
        if self.do_normalize_vowel_ending:
            text=self._normalize_vowel_ending(text)

        return text

    def get_char_stats(self,text):
        """Print, one per line, how often each invisible character occurs in `text`.

        Order: byte order mark, reversed byte order mark, word joiner, soft
        hyphen, zero-width space, no-break space, zero-width non-joiner,
        zero-width joiner.
        """
        tracked_chars = (
            NormalizerI.BYTE_ORDER_MARK,
            NormalizerI.BYTE_ORDER_MARK_2,
            NormalizerI.WORD_JOINER,
            NormalizerI.SOFT_HYPHEN,
            NormalizerI.ZERO_WIDTH_SPACE,
            NormalizerI.NO_BREAK_SPACE,
            NormalizerI.ZERO_WIDTH_NON_JOINER,
            NormalizerI.ZERO_WIDTH_JOINER,
        )
        for ch in tracked_chars:
            print(text.count(ch))

    def correct_visarga(self,text,visarga_char,char_range):
        """Replace a colon that follows a U+0900-U+097F character with U+0903.

        Args:
            text: input string.
            visarga_char: accepted for interface compatibility; not used.
            char_range: accepted for interface compatibility; not used.

        Returns:
            The text with each such ':' replaced by the visarga sign.
        """
        return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)

In [6]:
# Sample Devanagari sentence that the following cells pass to the normalizer.
text = 'मुझे बड़े बूबी पसंद हैं । लेकिन बड़ा लिंग परेशान कर रहा है ।'

In [7]:
# Strict nasal -> anusvaara mode, with chandra and vowel-ending normalization on.
# NOTE(review): the name `normallizer` is misspelled; kept as-is because other
# cells refer to it.
normallizer = BaseNormalizer(
    nasals_mode='to_anusvaara_strict',
    do_normalize_chandras=True,
    do_normalize_vowel_ending=True,
)

In [8]:
# Run the sample text through the configured normalizer.
s=normallizer.normalize(text)

In [9]:
# Show the normalized text.
print(s)

मुझे बड़े बूबी पसंद् हैं । लेकिन् बड़ा लिंग् परेशान् कर् रहा है ।
