Adding a neural network #38

Open · wants to merge 2 commits into master
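This PR replaces the Wiktionary-based accentuation fallback in Accentor with a neural stress-placement model. The methods get_correct_omograph_wiki, get_simple_form_wiki and load_wiki_page are removed along with the use_wiki constructor flag, and both unresolved homographs and out-of-dictionary words are now routed through the new NerAccentor class; morphotags become mandatory on that path. A minimal usage sketch of the new entry point (the example word and the UD-style morphotag string are illustrative assumptions; only the class, the method, and the '+' stress-marking convention come from the diff):

from russian_g2p.ner_accentuation.NerAccentor import NerAccentor

ner_acc = NerAccentor()
# The morphotag below is an assumed UD-style string, mirroring the
# morphotags_list[0] value that Accentor forwards to define_stress().
variants = ner_acc.define_stress('замок', 'NOUN Case=Nom|Gender=Masc|Number=Sing')
print(variants)  # one variant per predicted stress position, e.g. ['за+мок']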
237 changes: 33 additions & 204 deletions russian_g2p/Accentor.py
@@ -11,9 +11,11 @@
import dawg
import logging

from russian_g2p.ner_accentuation.NerAccentor import NerAccentor


class Accentor:
def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
def __init__(self, mode='one', debug='no', exception_for_unknown=False):
if debug == 'no':
logging.basicConfig()
else:
@@ -26,7 +28,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
'я'}
self.__russian_vowels = {'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'}
self.exception_for_unknown = exception_for_unknown
self.use_wiki = use_wiki
#self.use_wiki = use_wiki
self.__homonyms = None
self.__simple_words_dawg = None
self.__function_words = None
Expand All @@ -35,6 +37,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
self.__bad_words = []
self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
self.__ner_acc = NerAccentor()
assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
@@ -93,174 +96,6 @@ def __del__(self):
del self.__bad_words
del self.__function_words

def get_correct_omograph_wiki(self, root_text, cur_word, morphotag='X'):
'''
Homograph resolution.
Uses morphological information about a word
to disambiguate its readings.
'''
langs = root_text.split('<hr />')
#print('hello?')
root = None
for lang in langs:
#print(lang)
head_pos = lang.find('<h2><span class="mw-headline" id="Russian">Russian</span>')
if head_pos != -1:
root = lxml.html.document_fromstring(lang[head_pos:])
if root == None:
#print(':^(')
return []
good_headers = []
shallow_vars = set()
results = set()
for header in root.findall('.//*[@class="mw-headline"]'):
#print(cur_word, morphotag)
if header.text_content() in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Conjunction', 'Determiner', 'Interjection',
'Morpheme', 'Numeral', 'Particle', 'Predicative', 'Preposition', 'Pronoun']:
good_headers.append(header.text_content())
acc_word = header.getparent().getnext()
while acc_word.tag != 'p':
acc_word = acc_word.getnext()
#print(acc_word)
result = []
hyphen = 0
for part in acc_word.find_class('Cyrl headword'):
result += [part.text_content()]
if part.text_content().find('-') != -1:
hyphen = 1
#print(result)
if (hyphen == 1) or (len(result) == 1):
result = ''.join(result)
else:
continue
if result.replace('ё', 'е́').find('́') != -1:
shallow_vars.add(result)
if header.text_content()[0] == morphotag[0]:
#print('The tags are equal')
if header.text_content()[0] == 'N':
gramm_info = acc_word.getnext()
if gramm_info.text_content().find('of') != -1:
for variant in gramm_info.find_class('form-of-definition'):
info = variant.findall('a')
#print(variant.text_content())
try:
if info[0].text_content()[0] == 'p':
case = 'l'
else:
case = info[0].text_content()[0]
#print(case)
number = info[1].text_content()[0]
#print(number + case, morphotag)
if case == morphotag[morphotag.find('Case=') + 5].lower():
results.add(result)
except IndexError:
continue
else:
if morphotag[morphotag.find('Case=') + 5].lower() == 'n':
results.add(result)
elif header.text_content()[0] == 'V':
gramm_info = acc_word.getnext()
if morphotag.find('Mood=Inf') != -1:
results.add(result)
#print('Wut',morphotag, results)
for variant in gramm_info.find_class('form-of-definition'):
#print(variant.text_content())
t = 0
if (variant.text_content().find('indicative') != -1) and (morphotag.find('Mood=Ind') != -1):
if ((variant.text_content().find('future') != -1) or (variant.text_content().find('present') != -1)) and (morphotag.find('Tense=Notpast') != -1):
#print('I should be here')
results.add(result)
#print(1, results)
elif (variant.text_content().find('past') != -1) and (morphotag.find('Tense=Past') != -1):
results.add(result)
#print(2, results)
elif (variant.text_content().find('imperative') != -1) and (morphotag.find('Mood=Imp') != -1):
results.add(result)
else:
results.add(result)
elif (header.text_content()[0] == 'D') and (morphotag.find('PRON') != -1):
acc_word = header.getparent().getnext()
result = ''
for part in acc_word.find_class('Cyrl headword'):
result += part.text_content()
results.add(result)
elif (header.text_content().lower().find(morphotag.split()[0].lower()) != -1):
acc_word = header.getparent().getnext()
result = ''
for part in acc_word.find_class('Cyrl headword'):
result += part.text_content()
results.add(result)
#print(shallow_vars)
if len(list(shallow_vars)) == 1:
if list(shallow_vars)[0].replace('ё', 'е+').replace('́', '') == cur_word:
return [list(shallow_vars)[0].replace('ё', 'ё+').replace('́', '+').replace('̀', '+')]
#print(results)
if len(list(results)) != 1:
return []
best_results = [variant.replace('́', '+') for variant in results]
return list(best_results)

def get_simple_form_wiki(self, root_text, form):
'''
Directly find the relevant form and its stress,
without using morphological information.
'''
root = lxml.html.document_fromstring(root_text)
rel_forms = set()
for header in root.findall('.//*[@class="Cyrl headword"][@lang="ru"]'):
header_text = header.text_content().replace('ё', 'е́')
header_text_best = header.text_content().replace('ё', 'ё+').replace('́', '+')
if header_text.replace('́', '') == form:
if header_text.find('́') != -1:
rel_forms.add(header_text_best)
for mention in root.findall('.//i[@class="Cyrl mention"][@lang="ru"]'):
mention_text = mention.text_content().replace('ё', 'е́')
mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
if mention_text.replace('́', '') == form:
if mention_text.replace('ё', 'е́').find('́') != -1:
rel_forms.add(mention_text_best)
for mention in root.findall('.//b[@class="Cyrl"][@lang="ru"]'):
mention_text = mention.text_content().replace('ё', 'е́')
mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
if mention_text.replace('́', '') == form:
if mention_text.replace('ё', 'е́').find('́') != -1:
rel_forms.add(mention_text_best)
elif mention_text.find('(') != -1:
if mention_text.replace('́', '').find(form) != -1:
if mention_text.find('́') != -1:
rel_forms.add(mention_text_best[mention_text.replace('́', '').find(form):])
elif re.sub(r'[\(\)́]', '', mention_text) == form:
rel_forms.add(re.sub(r'[\(\)]', '', mention_text_best))
for target in root.xpath('.//span[@class="Cyrl"][@lang="ru"]'):
one_form = target.text_content()
if one_form.replace('ё', 'е́').replace('́', '') == form:
if one_form.replace('ё', 'е́').find('́') != -1:
rel_forms.add(one_form.replace('ё', 'ё́').replace('́', '+'))
results = list(rel_forms)
if len(results) == 2:
if results[0].replace('ё', 'е') == results[1].replace('ё', 'е'):
rel_forms = set()
for var in results:
if var.find('ё') != -1:
rel_forms.add(var)
return list(rel_forms)

def load_wiki_page(self, cur_form):
if not self.use_wiki:
if self.exception_for_unknown:
raise ValueError(f'Word `{cur_form}` is unknown!')
return
query = urllib.parse.urlencode({ 'title' : cur_form })
try:
http_exception_type = urllib.error.HTTPError
except:
http_exception_type = urllib.request.HTTPError
try:
with urllib.request.urlopen(f'https://en.wiktionary.org/w/index.php?{query}&printable=yes') as f:
root_text = f.read().decode('utf-8')
return root_text
except http_exception_type:
return

def do_accents(self, source_phrase_and_morphotags: list) -> list:
self.logger.debug('Checking the source phrase...')
@@ -484,31 +319,29 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms += [self.__homonyms[cur_word][morpho_variants[best_ind]]]
accented_wordforms_many.append([self.__homonyms[cur_word][morpho_variants[best_ind]]])
else:
root_text = self.load_wiki_page(cur_word)
if root_text != None:
#print('am I even here?')
cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
elif len(cur_accented_wordforms) > 1:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_accented_wordforms])
warn = 'many'
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'
#print('am I even here?')
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
elif len(cur_accented_wordforms) > 1:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_accented_wordforms])
warn = 'many'
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'

else:
self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse Wiktionary page...')
root_text = self.load_wiki_page(cur_word)
if root_text != None:
cur_accented_wordforms = sorted(self.get_simple_form_wiki(root_text, cur_word))
#self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse Wiktionary page...')
if morphotags_list is None:
err_msg = f'Word `{cur_word}` has no morphotags. Try again by specifying it'
raise ValueError(err_msg)
else:
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_simple_form_wiki(root_text, cur_word))
#print(cur_accented_wordforms)
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
@@ -518,20 +351,16 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms_many.append([cur_word])
warn = 'no'
else:
cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'
#cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])#sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
#if len(cur_accented_wordforms) == 1:
# accented_wordforms += [cur_accented_wordforms[0]]
# accented_wordforms_many.append([cur_accented_wordforms[0]])
# self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
#else:
accented_wordforms += [cur_word]
accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'

else:
accented_wordforms += [cur_word]
accented_wordforms_many.append([cur_word])
warn = 'no'
if i == 0:
if (accented_wordforms[0].find('+') != -1) or (len(separate_tokens) == 2):
break
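Taken together, the hunks above collapse the old Wiktionary branches into a single neural fallback. A condensed paraphrase of the new unknown-word logic in __do_accents (abridged from the added lines above, not a verbatim excerpt):

# Dictionary lookups stay first; the model only handles what they cannot resolve.
if morphotags_list is None:
    raise ValueError(f'Word `{cur_word}` has no morphotags. Try again by specifying it')
cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])
if len(cur_accented_wordforms) == 1:
    # Unambiguous prediction: accept it and cache it as a new homonym entry.
    accented_wordforms += [cur_accented_wordforms[0]]
    accented_wordforms_many.append([cur_accented_wordforms[0]])
    self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
else:
    # Several (or zero) stress candidates: keep the bare form and report every variant.
    accented_wordforms += [cur_word]
    accented_wordforms_many.append(cur_accented_wordforms)
    warn = 'many'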
30 changes: 30 additions & 0 deletions russian_g2p/ner_accentuation/NerAccentor.py
@@ -0,0 +1,30 @@
import numpy as np
from russian_g2p.ner_accentuation.ner_utils import load_place_stress_model, create_index_of_letters, create_morph_vector, create_index_of_con_vow


class NerAccentor:
    def __init__(self):
        self._place_stress_model = load_place_stress_model()

    def define_stress(self, word, morph_inf):
        # Encode the word and its morphological information as model inputs.
        all_word_vectors = [create_index_of_letters(word), create_morph_vector(morph_inf), create_index_of_con_vow(word)]
        # The prediction is a nested list: one row of per-letter probabilities
        # for the single input word, binarised at the 0.34 threshold.
        stress_vector_two_list = np.asarray(self._place_stress_model.predict(all_word_vectors, verbose=1) >= 0.34,
                                            dtype=np.int32)
        stress_vector = stress_vector_two_list.reshape(stress_vector_two_list.shape[0], stress_vector_two_list.shape[1])[0]
        # Collect every letter position that cleared the threshold.
        stress_index = []
        for i in range(len(stress_vector)):
            if stress_vector[i] == 1:
                stress_index.append(i)

        word_with_stress = [letter for letter in word]
        words_return = []
        if len(stress_index) > 0:
            # Emit one variant per candidate position, with '+' placed right
            # after the stressed letter.
            for index in stress_index:
                word_with_stress.insert(index + 1, '+')
                words_return.append(''.join(word_with_stress))
                word_with_stress.pop(index + 1)
        else:
            # No position cleared the threshold: return the word unmarked.
            words_return.append(''.join(word_with_stress))

        return words_return
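The ner_utils helpers and the trained model itself are not part of this diff (the model ships as a binary file), but the decoding step above can be checked in isolation. A self-contained toy example in which a fabricated probability row stands in for place_stress_model.predict; the word and the probabilities are made up, while the >= 0.34 threshold and the insert-after-letter convention come from the code above:

import numpy as np

word = 'замок'
# Fake per-letter stress probabilities, shape (1, len(word)) -- a stand-in
# for the real model output.
probs = np.array([[0.05, 0.70, 0.02, 0.45, 0.01]])

stress_vector = np.asarray(probs >= 0.34, dtype=np.int32)[0]
variants = []
for i, flag in enumerate(stress_vector):
    if flag == 1:
        # '+' goes right after the letter predicted as stressed.
        variants.append(word[:i + 1] + '+' + word[i + 1:])
print(variants)  # ['за+мок', 'замо+к']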
Binary file not shown.