Adding a neural network #38

Open · wants to merge 2 commits into master
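This PR replaces the Wiktionary-based accentuation fallback in Accentor with a neural stress-placement model. The methods get_correct_omograph_wiki, get_simple_form_wiki and load_wiki_page are removed along with the use_wiki constructor flag, and both unresolved homographs and out-of-dictionary words are now routed through the new NerAccentor class; morphotags become mandatory on that path. A minimal usage sketch of the new entry point (the example word and the UD-style morphotag string are illustrative assumptions; only the class, the method, and the '+' stress-marking convention come from the diff):

from russian_g2p.ner_accentuation.NerAccentor import NerAccentor

ner_acc = NerAccentor()
# The morphotag below is an assumed UD-style string, mirroring the
# morphotags_list[0] value that Accentor forwards to define_stress().
variants = ner_acc.define_stress('замок', 'NOUN Case=Nom|Gender=Masc|Number=Sing')
print(variants)  # one variant per predicted stress position, e.g. ['за+мок']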
237 changes: 33 additions & 204 deletions russian_g2p/Accentor.py
@@ -11,9 +11,11 @@
import dawg
import logging

from russian_g2p.ner_accentuation.NerAccentor import NerAccentor


class Accentor:
def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
def __init__(self, mode='one', debug='no', exception_for_unknown=False):
if debug == 'no':
logging.basicConfig()
else:
@@ -26,7 +28,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
'я'}
self.__russian_vowels = {'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'}
self.exception_for_unknown = exception_for_unknown
self.use_wiki = use_wiki
#self.use_wiki = use_wiki
self.__homonyms = None
self.__simple_words_dawg = None
self.__function_words = None
Expand All @@ -35,6 +37,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
self.__bad_words = []
self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
self.__ner_acc = NerAccentor()
assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
@@ -93,174 +96,6 @@ def __del__(self):
del self.__bad_words
del self.__function_words

def get_correct_omograph_wiki(self, root_text, cur_word, morphotag='X'):
'''
Homograph resolution.
Uses morphological information about a word
to disambiguate its readings.
'''
langs = root_text.split('<hr />')
#print('hello?')
root = None
for lang in langs:
#print(lang)
head_pos = lang.find('<h2><span class="mw-headline" id="Russian">Russian</span>')
if head_pos != -1:
root = lxml.html.document_fromstring(lang[head_pos:])
if root == None:
#print(':^(')
return []
good_headers = []
shallow_vars = set()
results = set()
for header in root.findall('.//*[@class="mw-headline"]'):
#print(cur_word, morphotag)
if header.text_content() in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Conjunction', 'Determiner', 'Interjection',
'Morpheme', 'Numeral', 'Particle', 'Predicative', 'Preposition', 'Pronoun']:
good_headers.append(header.text_content())
acc_word = header.getparent().getnext()
while acc_word.tag != 'p':
acc_word = acc_word.getnext()
#print(acc_word)
result = []
hyphen = 0
for part in acc_word.find_class('Cyrl headword'):
result += [part.text_content()]
if part.text_content().find('-') != -1:
hyphen = 1
#print(result)
if (hyphen == 1) or (len(result) == 1):
result = ''.join(result)
else:
continue
if result.replace('ё', 'е́').find('́') != -1:
shallow_vars.add(result)
if header.text_content()[0] == morphotag[0]:
#print('The tags are equal')
if header.text_content()[0] == 'N':
gramm_info = acc_word.getnext()
if gramm_info.text_content().find('of') != -1:
for variant in gramm_info.find_class('form-of-definition'):
info = variant.findall('a')
#print(variant.text_content())
try:
if info[0].text_content()[0] == 'p':
case = 'l'
else:
case = info[0].text_content()[0]
#print(case)
number = info[1].text_content()[0]
#print(number + case, morphotag)
if case == morphotag[morphotag.find('Case=') + 5].lower():
results.add(result)
except IndexError:
continue
else:
if morphotag[morphotag.find('Case=') + 5].lower() == 'n':
results.add(result)
elif header.text_content()[0] == 'V':
gramm_info = acc_word.getnext()
if morphotag.find('Mood=Inf') != -1:
results.add(result)
#print('Wut',morphotag, results)
for variant in gramm_info.find_class('form-of-definition'):
#print(variant.text_content())
t = 0
if (variant.text_content().find('indicative') != -1) and (morphotag.find('Mood=Ind') != -1):
if ((variant.text_content().find('future') != -1) or (variant.text_content().find('present') != -1)) and (morphotag.find('Tense=Notpast') != -1):
#print('I should be here')
results.add(result)
#print(1, results)
elif (variant.text_content().find('past') != -1) and (morphotag.find('Tense=Past') != -1):
results.add(result)
#print(2, results)
elif (variant.text_content().find('imperative') != -1) and (morphotag.find('Mood=Imp') != -1):
results.add(result)
else:
results.add(result)
elif (header.text_content()[0] == 'D') and (morphotag.find('PRON') != -1):
acc_word = header.getparent().getnext()
result = ''
for part in acc_word.find_class('Cyrl headword'):
result += part.text_content()
results.add(result)
elif (header.text_content().lower().find(morphotag.split()[0].lower()) != -1):
acc_word = header.getparent().getnext()
result = ''
for part in acc_word.find_class('Cyrl headword'):
result += part.text_content()
results.add(result)
#print(shallow_vars)
if len(list(shallow_vars)) == 1:
if list(shallow_vars)[0].replace('ё', 'е+').replace('́', '') == cur_word:
return [list(shallow_vars)[0].replace('ё', 'ё+').replace('́', '+').replace('̀', '+')]
#print(results)
if len(list(results)) != 1:
return []
best_results = [variant.replace('́', '+') for variant in results]
return list(best_results)

def get_simple_form_wiki(self, root_text, form):
'''
Directly find the relevant form and its stress,
without using morphological information.
'''
root = lxml.html.document_fromstring(root_text)
rel_forms = set()
for header in root.findall('.//*[@class="Cyrl headword"][@lang="ru"]'):
header_text = header.text_content().replace('ё', 'е́')
header_text_best = header.text_content().replace('ё', 'ё+').replace('́', '+')
if header_text.replace('́', '') == form:
if header_text.find('́') != -1:
rel_forms.add(header_text_best)
for mention in root.findall('.//i[@class="Cyrl mention"][@lang="ru"]'):
mention_text = mention.text_content().replace('ё', 'е́')
mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
if mention_text.replace('́', '') == form:
if mention_text.replace('ё', 'е́').find('́') != -1:
rel_forms.add(mention_text_best)
for mention in root.findall('.//b[@class="Cyrl"][@lang="ru"]'):
mention_text = mention.text_content().replace('ё', 'е́')
mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
if mention_text.replace('́', '') == form:
if mention_text.replace('ё', 'е́').find('́') != -1:
rel_forms.add(mention_text_best)
elif mention_text.find('(') != -1:
if mention_text.replace('́', '').find(form) != -1:
if mention_text.find('́') != -1:
rel_forms.add(mention_text_best[mention_text.replace('́', '').find(form):])
elif re.sub(r'[\(\)́]', '', mention_text) == form:
rel_forms.add(re.sub(r'[\(\)]', '', mention_text_best))
for target in root.xpath('.//span[@class="Cyrl"][@lang="ru"]'):
one_form = target.text_content()
if one_form.replace('ё', 'е́').replace('́', '') == form:
if one_form.replace('ё', 'е́').find('́') != -1:
rel_forms.add(one_form.replace('ё', 'ё́').replace('́', '+'))
results = list(rel_forms)
if len(results) == 2:
if results[0].replace('ё', 'е') == results[1].replace('ё', 'е'):
rel_forms = set()
for var in results:
if var.find('ё') != -1:
rel_forms.add(var)
return list(rel_forms)

def load_wiki_page(self, cur_form):
if not self.use_wiki:
if self.exception_for_unknown:
raise ValueError(f'Word `{cur_form}` is unknown!')
return
query = urllib.parse.urlencode({ 'title' : cur_form })
try:
http_exception_type = urllib.error.HTTPError
except:
http_exception_type = urllib.request.HTTPError
try:
with urllib.request.urlopen(f'https://en.wiktionary.org/w/index.php?{query}&printable=yes') as f:
root_text = f.read().decode('utf-8')
return root_text
except http_exception_type:
return

def do_accents(self, source_phrase_and_morphotags: list) -> list:
self.logger.debug('Checking the source phrase...')
@@ -484,31 +319,29 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms += [self.__homonyms[cur_word][morpho_variants[best_ind]]]
accented_wordforms_many.append([self.__homonyms[cur_word][morpho_variants[best_ind]]])
else:
root_text = self.load_wiki_page(cur_word)
if root_text != None:
#print('am I even here?')
cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
elif len(cur_accented_wordforms) > 1:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_accented_wordforms])
warn = 'many'
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'
#print('am I even here?')
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
elif len(cur_accented_wordforms) > 1:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_accented_wordforms])
warn = 'many'
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'

else:
self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse Wiktionary page...')
root_text = self.load_wiki_page(cur_word)
if root_text != None:
cur_accented_wordforms = sorted(self.get_simple_form_wiki(root_text, cur_word))
#self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse Wiktionary page...')
if morphotags_list is None:
err_msg = f'Word `{cur_word}` has no morphotags. Try again by specifying it'
raise ValueError(err_msg)
else:
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_simple_form_wiki(root_text, cur_word))
#print(cur_accented_wordforms)
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
@@ -518,20 +351,16 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms_many.append([cur_word])
warn = 'no'
else:
cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'
#cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])#sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
#if len(cur_accented_wordforms) == 1:
# accented_wordforms += [cur_accented_wordforms[0]]
# accented_wordforms_many.append([cur_accented_wordforms[0]])
# self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
#else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'

else:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_word])
warn = 'no'
if i == 0:
if (accented_wordforms[0].find('+') != -1) or (len(separate_tokens) == 2):
break
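Taken together, the hunks above collapse the old Wiktionary branches into a single neural fallback. A condensed paraphrase of the new unknown-word logic in __do_accents (abridged from the added lines above, not a verbatim excerpt):

# Dictionary lookups stay first; the model only handles what they cannot resolve.
if morphotags_list is None:
    raise ValueError(f'Word `{cur_word}` has no morphotags. Try again by specifying it')
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])
if len(cur_accented_wordforms) == 1:
    # Unambiguous prediction: accept it and cache it as a new homonym entry.
    accented_wordforms += [cur_accented_wordforms[0]]
    accented_wordforms_many.append([cur_accented_wordforms[0]])
    self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
else:
    # Several (or zero) stress candidates: keep the bare form and report every variant.
    accented_wordforms += [cur_word]
    accented_wordforms_many.append(cur_accented_wordforms)
    warn = 'many'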
30 changes: 30 additions & 0 deletions russian_g2p/ner_accentuation/NerAccentor.py
@@ -0,0 +1,30 @@
import numpy as np
from russian_g2p.ner_accentuation.ner_utils import load_place_stress_model, create_index_of_letters, create_morph_vector, create_index_of_con_vow


class NerAccentor:
    def __init__(self):
        self._place_stress_model = load_place_stress_model()

    def define_stress(self, word, morph_inf):
        # Encode the word and its morphological information as model inputs.
        all_word_vectors = [create_index_of_letters(word), create_morph_vector(morph_inf), create_index_of_con_vow(word)]
        # The prediction is a nested list: one row of per-letter probabilities
        # for the single input word, binarised at the 0.34 threshold.
        stress_vector_two_list = np.asarray(self._place_stress_model.predict(all_word_vectors, verbose=1) >= 0.34,
                                            dtype=np.int32)
        stress_vector = stress_vector_two_list.reshape(stress_vector_two_list.shape[0], stress_vector_two_list.shape[1])[0]
        # Collect every letter position that cleared the threshold.
        stress_index = []
        for i in range(len(stress_vector)):
            if stress_vector[i] == 1:
                stress_index.append(i)

        word_with_stress = [letter for letter in word]
        words_return = []
        if len(stress_index) > 0:
            # Emit one variant per candidate position, with '+' placed right
            # after the stressed letter.
            for index in stress_index:
                word_with_stress.insert(index + 1, '+')
                words_return.append(''.join(word_with_stress))
                word_with_stress.pop(index + 1)
        else:
            # No position cleared the threshold: return the word unmarked.
            words_return.append(''.join(word_with_stress))

        return words_return
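The ner_utils helpers and the trained model itself are not part of this diff (the model ships as a binary file), but the decoding step above can be checked in isolation. A self-contained toy example in which a fabricated probability row stands in for place_stress_model.predict; the word and the probabilities are made up, while the >= 0.34 threshold and the insert-after-letter convention come from the code above:

import numpy as np

word = 'замок'
# Fake per-letter stress probabilities, shape (1, len(word)) -- a stand-in
# for the real model output.
probs = np.array([[0.05, 0.70, 0.02, 0.45, 0.01]])

stress_vector = np.asarray(probs >= 0.34, dtype=np.int32)[0]
variants = []
for i, flag in enumerate(stress_vector):
    if flag == 1:
        # '+' goes right after the letter predicted as stressed.
        variants.append(word[:i + 1] + '+' + word[i + 1:])
print(variants)  # ['за+мок', 'замо+к']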
Binary file not shown.