In [1]:
from itertools import product
import warnings

In [2]:
import sys
import os
sys.path.append("../") # go to parent dir
from src.Derivation import Derivation
derivator = Derivation(use_guesser=True)

In [3]:
from tqdm.notebook import tqdm

In [4]:
from src.FinateStateMachine import FSM

In [5]:
# Load one word list per part of speech: wordlist[pos] -> list of stripped lines.
wordlist = dict()
# The notebook also needs a 'part' list. Guard the append so that re-running
# this cell does not register 'part' twice in derivator.pos_all.
if 'part' not in derivator.pos_all:
    derivator.pos_all.append('part')
for pos in derivator.pos_all:
    with open(f'../data/wiktionary/v001.00/{pos}.txt', encoding='utf8') as f:
        # One entry per line; surrounding whitespace (incl. the newline) is dropped.
        wordlist[pos] = [l.strip() for l in f]

In [6]:
from multiprocessing import Pool, TimeoutError
import os

In [11]:
warnings.simplefilter("ignore")

In [12]:
class Part:
    """One side of a compound, stored as an FSM over its surface forms.

    Every stored path is the lower-cased characters of a form, followed by the
    rule id and a ``(lemma, pos)`` tuple.
    """

    def __init__(self, name, pos_b, pos_a=None, rule_id=None, wordlist=None, n_workers=30):
        """Build the FSM for this part.

        Args:
            name: Label of the part; also used as the rule id when ``rule_id`` is falsy.
            pos_b: Part of speech of the words in ``wordlist``.
            pos_a: Part of speech after the rule is applied; defaults to ``pos_b``.
            rule_id: If truthy, every word is run through ``derivator.derive`` with
                this rule and the derived forms are stored. Otherwise the words
                themselves are stored.
            wordlist: Iterable of base words; ``None`` means no words.
            n_workers: Positive number of worker processes used for derivation.
        """
        self.name = name
        self.pos_b, self.pos_a = pos_b, pos_a or pos_b
        self.rule_id = rule_id or name
        self.fsm = FSM({self.rule_id})
        print('FSM', self.rule_id)
        # Materialise once: avoids a shared mutable default and lets any
        # iterable (including a generator) be measured with len().
        wordlist = list(wordlist) if wordlist is not None else []
        if rule_id:
            results = self._derive_all(wordlist, n_workers)
            print(len(results))
            for result_, word in zip(results, wordlist):
                for result in result_:
                    self.fsm.add_word(list(result.lower()) + [self.rule_id, (word, self.pos_b)])
        else:
            for word in tqdm(wordlist):
                self.fsm.add_word(list(word.lower()) + [self.rule_id, (word, self.pos_b)])

    def _derive_all(self, words, n_workers):
        """Run ``get_derived`` over ``words`` in a process pool, keeping input order."""
        if not words:
            # Do not spawn worker processes when there is nothing to derive.
            return []
        # Send the words in chunks: with the default chunksize of 1 the bound
        # method (and therefore ``self``) is pickled once per word.
        chunksize = max(1, len(words) // (n_workers * 8))
        with Pool(n_workers) as p:
            return list(tqdm(p.imap(self.get_derived, words, chunksize), total=len(words)))

    def get_derived(self, word):
        """Return the forms ``derivator.derive`` yields for ``word`` under this rule, or ``[]``."""
        derived = derivator.derive(word_b=word.lower(), pos_b=self.pos_b, rule_id=self.rule_id, use_rare=True)
        if derived:
            return derived[self.rule_id]
        return []

    def add_word(self, form, lemma=None, pos=None):
        """Store a single ``form``; ``lemma`` defaults to ``form`` and ``pos`` to ``self.pos_b``."""
        self.fsm.add_word(list(form.lower()) + [self.rule_id, (lemma or form, pos or self.pos_b)])

    def analyze_word(self, word):
        """Return whatever the FSM reports for the lower-cased ``word``."""
        return self.fsm.analyze_word(word.lower())

In [13]:
#prt = Part('rule619*(noun + н1(ый) -> adj)', 'noun', 'adj',  'rule619*(noun + н1(ый) -> adj)', wordlist['noun'])

In [14]:
mod_rare = dict()
head_rare = dict()


def _read_rare_rows(path):
    """Return the ';'-separated fields of every data row in a rare-compounds CSV.

    The first line is skipped (presumably a header) and blank lines are ignored.
    A missing file is treated as "no rare entries" and gives an empty list.
    """
    try:
        with open(path, encoding='utf8') as f:
            next(f, None)  # skip the first line
            return [line.strip().split(';') for line in f if line.strip()]
    except FileNotFoundError:
        return []


# Per-POS files: rows are lemma;pos_lemma;form;i_arg.
# i_arg == '0' goes to head_rare, any other value to mod_rare.
for pos in derivator.pos_all:
    mod_rare[pos] = []
    head_rare[pos] = []
    for lemma, pos_lemma, form, i_arg in _read_rare_rows(f'../src/rules/compounds_rare_{pos}.csv'):
        target = head_rare if i_arg == '0' else mod_rare
        target[pos].append((lemma, pos_lemma, form))

# "Star" file: rows carry an extra rule_id column and are keyed by it instead of by POS.
for lemma, pos_lemma, form, rule_id, i_arg in _read_rare_rows('../src/rules/compounds_rare_star.csv'):
    # Both dicts get the key, so later lookups by rule_id work on either side.
    head_rare.setdefault(rule_id, [])
    mod_rare.setdefault(rule_id, [])
    target = head_rare if i_arg == '0' else mod_rare
    target[rule_id].append((lemma, pos_lemma, form))


In [15]:
# Build one Part (FSM) per distinct head / modifier source used by the compound
# rules. Parts are cached by key, so a rule or POS shared by several compound
# rules is built only once.
head_parts = dict()
mod_parts = dict()

for rule in derivator.rules_compound:
    if rule.after_merge_rule_ids:
        # not implemented yet
        continue
    head_rules, mod_rules = rule.simple_rule_ids[0], rule.simple_rule_ids[1]
    if head_rules:
        # e.g. suffix: the head is derived with the first listed simple rule
        head_rule = derivator.rules_dict[head_rules[0]]
        if head_rule.name not in head_parts:
            head_parts[head_rule.name] = Part(head_rule.name, head_rule.pos_b, head_rule.pos_a, head_rule.name, wordlist[head_rule.pos_b])
    else:
        # no changes: the head is a plain word of rule.pos_b
        if rule.pos_b not in head_parts:
            head_parts[rule.pos_b] = Part(rule.pos_b, rule.pos_b, rule.pos_b, None, wordlist[rule.pos_b])
            # .get: the rare files are optional, so a key may be absent
            for lemma, pos, form in head_rare.get(rule.pos_b, []):
                head_parts[rule.pos_b].add_word(form, lemma, pos)

    if mod_rules:
        # interfix
        mod_rule = derivator.rules_dict[mod_rules[0]]
        if mod_rule.name not in mod_parts:
            mod_parts[mod_rule.name] = Part(mod_rule.name, mod_rule.pos_b, mod_rule.pos_a, mod_rule.name, wordlist[mod_rule.pos_b])
    else:
        # no changes or star
        if rule.poss_m[0] == '*':
            # star: the modifier set comes only from the rare entries keyed by the rule name
            if rule.name not in mod_parts:
                mod_parts[rule.name] = Part(rule.name, '*', '*', None, [])
                # .get: a star rule without any rare entries yields an empty part
                for lemma, pos, form in mod_rare.get(rule.name, []):
                    mod_parts[rule.name].add_word(form, lemma, pos)
        else:
            # adv, noun, etc.
            pos_m = rule.poss_m[0]
            if pos_m not in mod_parts:
                mod_parts[pos_m] = Part(pos_m, pos_m, pos_m, None, wordlist[pos_m])
                for lemma, pos, form in mod_rare.get(pos_m, []):
                    mod_parts[pos_m].add_word(form, lemma, pos)

FSM noun


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


FSM ruleINTERFIX(noun)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM ruleINTERFIX(adj)


HBox(children=(FloatProgress(value=0.0, max=47760.0), HTML(value='')))


47760
FSM ruleINTERFIX(num)


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))


39
FSM rule1028*(noun + GEN -> noun)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule552([пол] + noun + GEN -> noun)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


FSM rule211(verb + тель -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule216(verb + ец/нец/енец/омец -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule213(verb + щик -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule214(verb + льщик -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule212(verb + ник/еник/енник/ик -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule262(verb + к(а)/овк(а)/ёжк(а) -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule228(verb + лк(а) -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule267(verb + н(я)/отн(я)/овн(я) -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule264(verb + ств(о) -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule256(verb + ниj(е) -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


164085
FSM rule337(noun + ан-ин/чан-ин -> noun)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule578*(noun + иj(а) -> noun)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule446(verb + 0m2 -> noun)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule580(noun + 0 -> noun)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule619*(noun + н1(ый) -> adj)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule623(noun + н(ий) -> adj)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule628(noun + ов(ый) -> adj)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule630(noun + ск(ий) -> adj)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




45465
FSM rule664*(verb + енн(ый) -> adj)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM adj


HBox(children=(FloatProgress(value=0.0, max=47760.0), HTML(value='')))


FSM part


HBox(children=(FloatProgress(value=0.0, max=63641.0), HTML(value='')))


FSM rule752(noun + 0(ый) -> adj)


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


164085
FSM rule681(verb + 0(ий) -> adj)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM adv


HBox(children=(FloatProgress(value=0.0, max=8051.0), HTML(value='')))


FSM verb


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


FSM rule961([полу/само] + verb -> verb)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


FSM rule962(noun + [фицировать] -> verb)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


FSM noun


HBox(children=(FloatProgress(value=0.0, max=164085.0), HTML(value='')))


FSM adv


HBox(children=(FloatProgress(value=0.0, max=8051.0), HTML(value='')))


FSM rule1026([полу] + adv -> adv)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


FSM rule986(verb + ом/ком -> adv)


HBox(children=(FloatProgress(value=0.0, max=45465.0), HTML(value='')))


45465
FSM rule1027([мимо/само] + verb + ом -> adv)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [17]:
head_parts

{'adj': <__main__.Part at 0x2aac7300a3c8>,
 'adv': <__main__.Part at 0x2aaeaee0a9e8>,
 'noun': <__main__.Part at 0x2aaabfa32e80>,
 'part': <__main__.Part at 0x2aaaec7a10f0>,
 'rule1028*(noun + GEN -> noun)': <__main__.Part at 0x2aab001de4e0>,
 'rule211(verb + тель -> noun)': <__main__.Part at 0x2aab001ec668>,
 'rule212(verb + ник/еник/енник/ик -> noun)': <__main__.Part at 0x2aaabf782cf8>,
 'rule213(verb + щик -> noun)': <__main__.Part at 0x2aab001ecb70>,
 'rule214(verb + льщик -> noun)': <__main__.Part at 0x2aab001ecb00>,
 'rule216(verb + ец/нец/енец/омец -> noun)': <__main__.Part at 0x2aaabfa2e080>,
 'rule228(verb + лк(а) -> noun)': <__main__.Part at 0x2aab001de518>,
 'rule256(verb + ниj(е) -> noun)': <__main__.Part at 0x2aab001de400>,
 'rule262(verb + к(а)/овк(а)/ёжк(а) -> noun)': <__main__.Part at 0x2aab1ad4ab38>,
 'rule264(verb + ств(о) -> noun)': <__main__.Part at 0x2aaaec7816d8>,
 'rule267(verb + н(я)/отн(я)/овн(я) -> noun)': <__main__.Part at 0x2aaaec792ef0>,
 'rule329(noun + ни

In [18]:
# Run every supported compound rule over the word list of its result POS and
# collect all splits in `analyzed`.
analyzed = []

for rule in tqdm(derivator.rules_compound):
    # Rules with post-merge steps are not implemented yet.
    if rule.after_merge_rule_ids:
        continue
    head_rules, mod_rules = rule.simple_rule_ids[0], rule.simple_rule_ids[1]
    print(rule.name, head_rules, mod_rules)

    # Head side: keyed by the simple rule's name (e.g. suffix) or by the plain POS.
    head_key = derivator.rules_dict[head_rules[0]].name if head_rules else rule.pos_b
    head_part = head_parts[head_key]

    # Modifier side: interfix rule name, the compound rule's own name for
    # "star" rules, or the plain POS (adv, noun, etc.).
    if mod_rules:
        mod_key = derivator.rules_dict[mod_rules[0]].name
    elif rule.poss_m[0] == '*':
        mod_key = rule.name
    else:
        mod_key = rule.poss_m[0]
    mod_part = mod_parts[mod_key]

    print(head_part, mod_part, rule.order)
    # order == [0, 1] puts the head on the left; anything else ([1, 0]) puts the modifier first.
    if rule.order == [0, 1]:
        left_part, right_part = head_part, mod_part
    else:
        left_part, right_part = mod_part, head_part
    ca = CompoundAnalyzer(rule.name, rule.pos_a, left_part, right_part)
    for word in wordlist[rule.pos_a]:
        analyzed.extend(ca.analyze(word, rule.pos_a))

HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))

rule550([noun + ITFX] + noun -> noun) [] ['ruleINTERFIX(noun)']
<__main__.Part object at 0x2aaabfa32e80> <__main__.Part object at 0x2aab001f39b0> [1, 0]
rule550([adj + ITFX] + noun -> noun) [] ['ruleINTERFIX(adj)']
<__main__.Part object at 0x2aaabfa32e80> <__main__.Part object at 0x2aab001eca58> [1, 0]
rule550([num + ITFX] + noun -> noun) [] ['ruleINTERFIX(num)']
<__main__.Part object at 0x2aaabfa32e80> <__main__.Part object at 0x2aaaec781080> [1, 0]
rule552([пол] + noun + GEN -> noun) ['rule1028*(noun + GEN -> noun)'] []
<__main__.Part object at 0x2aab001de4e0> <__main__.Part object at 0x2aab001f3978> [1, 0]
rule558([noun + ITFX] + verb + тель -> noun) ['rule211(verb + тель -> noun)'] ['ruleINTERFIX(noun)']
<__main__.Part object at 0x2aab001ec668> <__main__.Part object at 0x2aab001f39b0> [1, 0]
rule558([adj + ITFX] + verb + тель -> noun) ['rule211(verb + тель -> noun)'] ['ruleINTERFIX(adj)']
<__main__.Part object at 0x2aab001ec668> <__main__.Part object at 0x2aab001eca58> [1, 0]
rule5

In [19]:
head_parts['rule619*(noun + н1(ый) -> adj)'].fsm.states

[{}]

In [20]:
# Dump every analysis as one tab-separated line:
# word, pos, rule name, first lemma, first pos, first rule, second lemma, second pos, second rule.
# The encoding is explicit because the data is Cyrillic and the inputs are read as utf8;
# the platform default encoding is not guaranteed to handle it.
with open('compo_new_cor.txt', 'w', encoding='utf8') as f:
    for l in analyzed:
        w, p, nm, (ff, pf), iff, (s, ps), ss = l
        # write(), not writelines(): a single string is written per record.
        f.write('\t'.join([w, p, nm, ff, pf, iff, s, ps, ss]) + '\n')

In [19]:
len(analyzed)

799445

In [21]:
# Keep hyphenated words unconditionally; otherwise keep an analysis only when
# both component lemmas are at least three characters long.
fixed = []
for entry in analyzed:
    word, pos, rule_name, (first, first_pos), link_rule, (second, second_pos), second_rule = entry
    if '-' in word or (len(first) >= 3 and len(second) >= 3):
        fixed.append(entry)

In [24]:
# Dump the filtered analyses as one tab-separated line per record.
# The encoding is explicit because the data is Cyrillic and the inputs are read as utf8;
# the platform default encoding is not guaranteed to handle it.
with open('compo_new_cor3+.txt', 'w', encoding='utf8') as f:
    for l in fixed:
        w, p, nm, (ff, pf), iff, (s, ps), ss = l
        # write(), not writelines(): a single string is written per record.
        f.write('\t'.join([w, p, nm, ff, pf, iff, s, ps, ss]) + '\n')

In [22]:
len(fixed)

482894

In [23]:
fixed[::10000]

[('абажуродержатель',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('абажур', 'noun'),
  'ruleINTERFIX(noun)',
  ('держатель', 'noun'),
  'noun'),
 ('Волоста',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('Вия', 'noun'),
  'ruleINTERFIX(noun)',
  ('Лоста', 'noun'),
  'noun'),
 ('кинодебют',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('Кин', 'noun'),
  'ruleINTERFIX(noun)',
  ('дебют', 'noun'),
  'noun'),
 ('микрофильмотека',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('микрофильм', 'noun'),
  'ruleINTERFIX(noun)',
  ('Тека', 'noun'),
  'noun'),
 ('прерафаэлит',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('Пери', 'noun'),
  'ruleINTERFIX(noun)',
  ('рафаэлит', 'noun'),
  'noun'),
 ('сенотаска',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('сена', 'noun'),
  'ruleINTERFIX(noun)',
  ('таска', 'noun'),
  'noun'),
 ('цезаролит',
  'noun',
  'rule550([noun + ITFX] + noun -> noun)',
  ('цезарь', 'noun'),
  'ruleINTERFIX(noun)

In [181]:
# Names of the compound rules that produced at least one analysis.
rules_used = {
    rule_name
    for word, pos, rule_name, (first, first_pos), link_rule, (second, second_pos), second_rule in analyzed
}

In [182]:
rules_used

{'rule1026([полу] + adv -> adv)',
 'rule550([adj + ITFX] + noun -> noun)',
 'rule550([noun + ITFX] + noun -> noun)',
 'rule550([num + ITFX] + noun -> noun)',
 'rule552([пол] + noun + GEN -> noun)',
 'rule570([adj + ITFX] + noun + ник/атник/арник/овник -> noun)',
 'rule570([noun + ITFX] + noun + ник/атник/арник/овник -> noun)',
 'rule570([num + ITFX] + noun + ник/атник/арник/овник -> noun)',
 'rule571([adj + ITFX] + noun + щик/чик/овщик -> noun)',
 'rule572([adj + ITFX] + noun + к(а) -> noun)',
 'rule572([noun + ITFX] + noun + к(а) -> noun)',
 'rule572([num + ITFX] + noun + к(а) -> noun)',
 'rule573([adj + ITFX] + noun + j(е) -> noun)',
 'rule573([noun + ITFX] + noun + j(е) -> noun)',
 'rule573([num + ITFX] + noun + j(е) -> noun)',
 'rule576([adj + ITFX] + noun + ан-ин/чан-ин -> noun)',
 'rule578([noun + ITFX] + noun + иj(а) -> noun)',
 'rule580([adj + ITFX] + noun + 0 -> noun)',
 'rule580([noun + ITFX] + noun + 0 -> noun)',
 'rule580([num + ITFX] + noun + 0 -> noun)',
 'rule754([adj + 

In [16]:
class CompoundAnalyzer:
    """Splits a word into a left and a right part according to one compound rule."""

    def __init__(self, name, pos, left, right):
        """Store the rule label, the POS this rule produces and the two part analyzers."""
        self.name, self.pos = name, pos
        self.left, self.right = left, right

    def analyze(self, word, pos):
        """Return every complete left+right split of ``word``.

        ``left.analyze_word`` / ``right.analyze_word`` are expected to yield
        ``(offset, match, rule)`` triples, where ``offset`` is used here as the
        number of characters consumed. A single '-' between the two parts is
        skipped. Each result is
        ``(word, self.pos, self.name, left_match, left_rule, right_match, right_rule)``.
        An empty list is returned when ``pos`` differs from the rule's POS.
        """
        if pos != self.pos:
            return []
        total = len(word)
        splits = []
        for cut, left_hit, left_rule in self.left.analyze_word(word):
            if cut == total:
                # The left part swallowed the whole word: nothing remains for the right part.
                continue
            # Step over an optional hyphen joining the two parts.
            tail_start = cut + 1 if word[cut] == '-' else cut
            remainder = word[tail_start:]
            # Keep only right-hand matches that reach exactly the end of the word.
            splits.extend(
                (word, self.pos, self.name, left_hit, left_rule, right_hit, right_rule)
                for used, right_hit, right_rule in self.right.analyze_word(remainder)
                if tail_start + used == total
            )
        return splits

In [95]:
# NOTE(review): num_left and adj_right are not defined anywhere in the visible
# notebook; the recorded run of this cell ended in a NameError. Leftover scratch
# cell — define the two parts first or remove the cell.
ca = CompoundAnalyzer(derivator.rules_compound[52].name, derivator.rules_compound[52].pos_a, num_left, adj_right)

NameError: name 'num_left' is not defined

In [96]:
ca.analyze('двухметровый', 'adj')

NameError: name 'ca' is not defined

In [123]:
from tqdm import tqdm_notebook as tqdm

In [124]:
# Collect every split that the current analyzer `ca` finds for each adjective.
# NOTE(review): adj_wordlist is not defined in the visible notebook — presumably
# wordlist['adj']; TODO confirm. This cell also overwrites the `analyzed` list
# built by the full compound-rule loop.
analyzed = []
for word in tqdm(adj_wordlist):
    analyzed.extend(ca.analyze(word, 'adj'))

HBox(children=(IntProgress(value=0, max=47760), HTML(value='')))




In [125]:
len(analyzed)

1865

In [5]:
def get_words(rule, word_b, pos_b, words_m, poss_m, pos_a, **kwargs):
    """Apply ``rule`` under every combination of guessed tags and union the results.

    Tags are guessed with ``derivator.tag_guesser.guess`` for the base word and
    for each modifier word. A word with no guess contributes a single empty
    tag dict, so it does not wipe out the combinations. Extra keyword
    arguments are passed through to ``rule.apply_with_tags``.
    """
    tokens = [word_b] + words_m
    token_poss = [pos_b] + poss_m
    tag_options = []
    for token, token_pos in zip(tokens, token_poss):
        guessed = derivator.tag_guesser.guess(word=token, pos=token_pos)
        tag_options.append(guessed if guessed else [{}])

    derived = set()
    for combo in product(*tag_options):
        derived |= rule.apply_with_tags(
            word_b=word_b,
            pos_b=pos_b,
            pos_a=pos_a,
            words_m=words_m,
            poss_m=poss_m,
            tags_dict=combo,
            **kwargs,
        )
    return derived

In [6]:
get_words(rule, 'проходить', 'verb', ['первый'], ['adj'], 'noun', use_rare=True)

{'первопроходец'}