In [52]:
import pickle
import subprocess
import sys
import nltk
from nltk import Nonterminal, nonterminals, Production, CFG

from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)

In [53]:
def tokenize(sentence):
    doc = Doc(sentence)
    doc.segment(segmenter)
    l = []
    for token in doc.tokens:
        l += [token.text]
    return l

In [54]:
def get_parts_of_speech(sentence):
    doc = Doc(sentence)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    
    parts_of_speech = {}
    l = []
    for token in doc.tokens:
        parts_of_speech[token.pos] = []
    for key in parts_of_speech:
        for token in doc.tokens:
            if key == token.pos:
                l += [token.text]
                parts_of_speech[key] = l
        l = []
    return parts_of_speech

In [132]:
def create_tree(sentence: str) -> CFG:
    grammar = """
    S -> NP VP | NP VP C VP
    VP -> V NP | VP NP | Adv VP | V NP PrepP | VP NP PrepP | V V
    NP -> N | Pronoun | AP NP
    PrepP -> Prep NP
    AP -> Adj | Adv Adj
    """
#     grammar = """
#     S -> NP VP
#     NP -> Adj N | N | Pronoun
#     VP -> V NP | VP NP
#     """
    parts_of_speech = get_parts_of_speech(sentence)
    
    if 'VERB' in parts_of_speech:
        grammar += f"""
        V -> {' | '.join(f"'{w}'" for w in parts_of_speech["VERB"])}
        """
    if 'NOUN' in parts_of_speech:
        grammar += f"""
        N -> {' | '.join(f"'{w}'" for w in parts_of_speech["NOUN"])}
        """ 
    if 'PRON' in parts_of_speech:
        grammar += f"""
        Pronoun -> {' | '.join(f"'{w}'" for w in parts_of_speech["PRON"])}
        """
    if 'ADP' in parts_of_speech:
        grammar += f"""
        Prep -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADP"])}
        """
    if 'ADV' in parts_of_speech:
        grammar += f"""
        Adv -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADV"])}
        """
    if 'ADJ' in parts_of_speech:
        grammar += f"""
        Adj -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADJ"])}
        """
    if 'SCONJ' in parts_of_speech:
        grammar += f"""
        C -> {' | '.join(f"'{w}'" for w in parts_of_speech["SCONJ"])}
        """
#     grammar += f"""
#     V -> {' | '.join(f"'{w}'" for w in parts_of_speech["VERB"])}
#     N -> {' | '.join(f"'{w}'" for w in parts_of_speech["NOUN"])}
#     Pronoun -> {' | '.join(f"'{w}'" for w in parts_of_speech["PRON"])}
#     Prep -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADP"])}
#     Adv -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADV"])}
#     Adj -> {' | '.join(f"'{w}'" for w in parts_of_speech["ADJ"])}
#     C -> {' | '.join(f"'{w}'" for w in parts_of_speech["SCONJ"])}
#     """
    return CFG.fromstring(grammar)

In [135]:
sentence = 'Мне очень нравится цвет'
tokens = tokenize(sentence)
grammar = create_tree(sentence)
print(grammar.productions())

parser = nltk.ChartParser(grammar)
trees = list(parser.parse(tokens))
print(trees[0])

[S -> NP VP, S -> NP VP C VP, VP -> V NP, VP -> VP NP, VP -> Adv VP, VP -> V NP PrepP, VP -> VP NP PrepP, VP -> V V, NP -> N, NP -> Pronoun, NP -> AP NP, PrepP -> Prep NP, AP -> Adj, AP -> Adv Adj, V -> 'нравится', N -> 'цвет', Pronoun -> 'Мне', Adv -> 'очень']
(S
  (NP (Pronoun Мне))
  (VP (Adv очень) (VP (V нравится) (NP (N цвет)))))


In [136]:
grammar.productions()

[S -> NP VP,
 S -> NP VP C VP,
 VP -> V NP,
 VP -> VP NP,
 VP -> Adv VP,
 VP -> V NP PrepP,
 VP -> VP NP PrepP,
 VP -> V V,
 NP -> N,
 NP -> Pronoun,
 NP -> AP NP,
 PrepP -> Prep NP,
 AP -> Adj,
 AP -> Adv Adj,
 V -> 'нравится',
 N -> 'цвет',
 Pronoun -> 'Мне',
 Adv -> 'очень']

# MAIN

In [115]:
sentence = input()
tokens = tokenize(sentence)
tree = create_tree(sentence)
parser = nltk.ChartParser(tree)
trees = list(parser.parse(tokens))
trees[0]

Мне очень нравятся твои волосы


ValueError: Grammar does not cover some of the input words: "'твои'".

In [81]:
if __name__ == '__main__':
    main()

я пишу письмо старому другу
(S
  (NP (Pronoun я))
  (VP
    (VP (V пишу) (NP (N письмо)))
    (NP (AP (Adj старому)) (NP (N другу)))))
