In [None]:
import re
import xml.etree.ElementTree as ET

# Dictionary parsing

In [2]:
# Location of the dictionary XML file (an OpenCorpora export, judging by the file name).
dict_path = r'stuff/dict.opcorpora.xml'

In [4]:
def parse_dict(xml_file_path):
    """Load an XML document from disk and return its root element.

    Args:
        xml_file_path: Path to the XML file to read.

    Returns:
        The root ``Element`` of the parsed document.
    """
    return ET.parse(xml_file_path).getroot()

In [5]:
# Parse the dictionary file once; later cells read the lemmata from this root element.
dict_root = parse_dict(dict_path)

# Parsing lemmata from dictionary

In [6]:
from collections import defaultdict

In [7]:
def parse_lemmata(root):
    """Index every word form found under ``<lemmata>`` by its surface text.

    For each ``<lemma>`` the lemma text is the ``t`` attribute of its ``<l>``
    child, and the main grammeme is the ``v`` attribute of the first ``<g>``
    inside that ``<l>`` (empty string when there is no ``<g>``).  Every
    ``<f>`` child with a non-empty ``t`` attribute becomes a key.

    Args:
        root: Root element that contains a ``<lemmata>`` child.

    Returns:
        A plain ``dict`` mapping form text to a list of unique
        ``(lemma_text, main_grammeme)`` tuples, in order of first appearance.
    """
    forms_index = {}

    for lemma_node in root.find('lemmata').findall('lemma'):
        head_node = lemma_node.find('l')
        lemma_text = head_node.get('t', '')

        # Only the first <g> of the head is kept as the "main" grammeme.
        grammeme_node = head_node.find('g')
        if grammeme_node is None:
            main_grammem = ''
        else:
            main_grammem = grammeme_node.get('v', '')

        # The entry is the same for every form of this lemma.
        entry = (lemma_text, main_grammem)

        for form_node in lemma_node.findall('f'):
            surface = form_node.get('t', '')
            if not surface:
                continue

            bucket = forms_index.setdefault(surface, [])
            # Skip repeats so a (lemma, grammeme) pair is listed once per form.
            if entry not in bucket:
                bucket.append(entry)

    return forms_index

In [8]:
# Build the form -> [(lemma, grammeme), ...] lookup from the parsed dictionary.
lemmata_dict = parse_lemmata(dict_root)

# Tokenize and lemmatize text

In [9]:
def tokenize(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of ``\\w`` characters, so punctuation and
    whitespace are dropped and act as separators.

    Args:
        text: The string to split.

    Returns:
        A list of token strings in order of appearance.
    """
    word_pattern = re.compile(r'\w+')
    return [match.group(0) for match in word_pattern.finditer(text)]

def normalize(word):
    """Return ``word`` converted to lower case."""
    lowered = word.lower()
    return lowered

In [10]:
def process_text(input_text, lemmata_dict, llm):
    """Annotate every token of ``input_text`` with a lemma and a grammeme.

    The text is stripped and split on newlines; each line is tokenized and
    every token is rendered as ``token{lemma=grammeme}``.

    Args:
        input_text: Text to process; may span several lines.
        lemmata_dict: Mapping from a lower-cased word form to a list of
            ``(lemma, grammeme)`` candidates.
        llm: Object providing ``disambiguate(token, candidates, line)`` and
            ``guess_unknown_word(token, line)``; each result is unpacked
            into a ``(lemma, grammeme)`` pair.

    Returns:
        The annotated lines joined with newlines; tokens within a line are
        separated by single spaces.
    """
    annotated_lines = []

    for line in input_text.strip().split('\n'):
        annotated_tokens = []

        for token in tokenize(line):
            lookup_key = normalize(token)

            if lookup_key not in lemmata_dict:
                # Unknown word: let the model propose a lemma and grammeme.
                print(f"Неизвестное слово: {token}")
                lemma, grammem = llm.guess_unknown_word(token, line)
            else:
                possible_lemmas = lemmata_dict[lookup_key]

                if len(possible_lemmas) != 1:
                    # More than one reading: pick one using the line as context.
                    print(f"Омонимия: {token} -> {possible_lemmas}")
                    lemma, grammem = llm.disambiguate(token, possible_lemmas, line)
                else:
                    lemma, grammem = possible_lemmas[0]

            annotated_tokens.append(f"{token}{{{lemma}={grammem}}}")

        annotated_lines.append(' '.join(annotated_tokens))

    return '\n'.join(annotated_lines)


In [None]:
def process_text(input_text, lemmata_dict, llm):
    """Annotate every token of ``input_text`` with a lemma and a grammeme.

    The text is stripped and split on newlines; each line is tokenized and
    every token is rendered as ``token{lemma=grammeme}``.  Three cases are
    distinguished per token:

    * the lower-cased form has exactly one dictionary candidate - use it;
    * it has several (or zero) candidates - ask ``llm.disambiguate``;
    * it is missing from the dictionary - ask ``llm.guess_unknown_word``.

    Args:
        input_text: Text to process; may span several lines.
        lemmata_dict: Mapping from a lower-cased word form to a list of
            ``(lemma, grammeme)`` candidates.
        llm: Object providing ``disambiguate(token, candidates, line)`` and
            ``guess_unknown_word(token, line)``; each result is unpacked
            into a ``(lemma, grammeme)`` pair.

    Returns:
        The annotated lines joined with newlines; tokens within a line are
        separated by single spaces.
    """
    lines = input_text.strip().split('\n')
    results = []

    for line in lines:
        tokens = tokenize(line)
        processed_tokens = []

        for token in tokens:
            normalized = normalize(token)

            if normalized in lemmata_dict:
                possible_lemmas = lemmata_dict[normalized]

                if len(possible_lemmas) == 1:
                    lemma, grammem = possible_lemmas[0]
                else:
                    # The homonymy branch belongs to the length check, not to
                    # the dictionary lookup: it needs `possible_lemmas`.
                    print(f"Омонимия: {token} -> {possible_lemmas}")
                    lemma, grammem = llm.disambiguate(token, possible_lemmas, line)
            else:
                # Unknown word. This must be the `else` of the lookup `if`;
                # as a `for ... else` it would run once after every token
                # loop and fail on lines without tokens.
                print(f"Неизвестное слово: {token}")
                lemma, grammem = llm.guess_unknown_word(token, line)

            # Exactly one annotation per token, whichever branch resolved it.
            processed_tokens.append(f"{token}{{{lemma}={grammem}}}")

        results.append(' '.join(processed_tokens))

    return '\n'.join(results)

In [11]:
# Sample sentence; the word 'печь' occurs twice - presumably to exercise the homonymy branch of process_text.
text = '''люблю русскую печь и печь пироги'''

In [12]:
# NOTE(review): `llm` is not defined in any cell shown here - it has to be created before this call.
# Run the cells above first: the recorded NameError shows `lemmata_dict` was missing when this cell was executed.
process_text(text, lemmata_dict, llm)

NameError: name 'lemmata_dict' is not defined